# Preprocessing Data

In [None]:
!pip install -q yfinance dateparser

In [None]:
# Yahoo Stock Price API package
import yfinance

# Data Manipulation packages
import pandas as pd
import numpy as np
import datetime as dt
import dateparser
import re

# Visualization
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# EDA & Preprocessor packages
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose

# ML model packages
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Evaluation Metrics packages
from sklearn.metrics import mean_squared_error, mean_absolute_error

# For Logging Purpose
# Connect to gsheets
from google.colab import auth, files, drive
import gspread
from google.auth import default
import os


# Yahoo Stock Price API package
import yfinance

# Data Manipulation packages
import pandas as pd
import numpy as np
import datetime as dt
import dateparser
import re

# Processor
from numba import njit

# Disable pandas warning
pd.options.mode.chained_assignment = None

# Visualization
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# EDA & Preprocessor packages
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose

# DL model packages
import tensorflow as tf

# Evaluation Metrics packages
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For Logging Purpose
# Connect to gsheets
from google.colab import auth, files, drive
import gspread
from google.auth import default
from uuid import uuid4
import os

# Yahoo Stock Price API package
import yfinance

# Data Manipulation packages
import os
import pandas as pd
import numpy as np
from numba import njit
import math
import dateparser
import re

import datetime as dt

# Visualization
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# EDA & Preprocessor packages
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose

# Deeplearning Lib
import tensorflow as tf

# Evaluation Metrics packages
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For Logging Purpose
# Connect to gsheets
from google.colab import auth, files, drive
import gspread
from google.auth import default

# Other
from typing import Tuple
from uuid import uuid4
import os

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under its MyDrive folder.
drive.mount("/content/drive")

In [None]:
## DATASET CONFIGURATION ##
# Single source of truth for the run: which ticker/date range to download, which
# macro/micro feature tables to merge, and how to prepare data and build the model.
gmt = 7 # GMT+7
# NOTE(review): adding 7 hours to now() only yields GMT+7 wall time if the runtime
# clock is UTC — confirm. `today` is not referenced anywhere in the visible code.
today = dt.datetime.now() + dt.timedelta(hours=gmt)
parameters = {
    # Passed to yfinance.download; "used_cols" selects the price columns that are kept.
    "stock_market_detail": {
        "ticker": "APIC.JK",
        "start_date": "1993-02-01",
        "end_date": "2022-10-31",
        "used_cols": ["Open", "High", "Low", "Close"]
    },
    # One flag per macro feature cell below; True means the table is read and merged.
    "macro_economy_features": {
        "bi_rate": False,
        "inflasi": True,
        "jisdor": True,
        "m2": True,
        "vix": True
    },
    "micro_economy_features": {
        # Choices = [APIC, BWPT, EDGE, HAIS, JRPT, MITI, PSKT, SCCO, SIDO, SMCB, TOWR, None]
        # Use the Python value None (not the string 'None') to exclude micro economy
        # features: the merge cell only tests this value for truthiness.
        "ticker": "SIDO"
    },
    "preprocessing_hyperparameters": {
        "window": 24,
        "rolling_agg": "mean",
        "fillna": -99999 # will not be used if dropna == True
    },
    "preprocessing_treatments": {
        "rolling_window": False,
        "dropna": True
    },
    "feature_extraction": {
        "high_low_pct": True,
        "pct_change": True
    },
    "data_preparation": {
        # Base feature list; every enabled macro/micro cell appends its own columns.
        "features": ["Open", "High", "Low"],
        "target_prediction": "Close",
        "test_size": .3,
        "validation_size": 0,
        "n_in": 3
    },
    "model_parameter": {
        # Must be one of the keys of "model_options" (and of `model_option` below).
        "selected_model": "linear_regression",
        "model_options": {
            "linear_regression": {},
            "random_forest": {
                "random_state": 2022,
                "n_estimators": 100,
                "criterion": "squared_error", # other options: "absolute_error", "friedman_mse", "poisson"
                "max_depth": None,
                "min_samples_split": 2,
                "min_samples_leaf": 1,
                "min_weight_fraction_leaf": 0.0,
                "max_features": 1, # other options: "sqrt", "log2", None, or an int/float
                "max_leaf_nodes": None,
                "min_impurity_decrease": 0,
                "bootstrap": True,
                "oob_score": False,
                "max_samples": None
            }
        }
    }
}

# Maps the "selected_model" name to the scikit-learn estimator class to instantiate.
model_option = {
    "linear_regression": LinearRegression,
    "random_forest": RandomForestRegressor
}

In [None]:
# Pull the configured ticker's price history from Yahoo Finance for the configured
# date range and keep only the configured price columns.
stock_market = yfinance.download(
    parameters["stock_market_detail"]["ticker"],
    start=parameters["stock_market_detail"]["start_date"],
    end=parameters["stock_market_detail"]["end_date"],
).loc[:, parameters["stock_market_detail"]["used_cols"]]

In [None]:
display(stock_market)

In [None]:
stock_market.info()

### Get Features

#### Makro

In [None]:
# Per-table settings for the macro-economy feature cells below. For every feature:
#   read_data_config - keyword arguments splatted into pd.read_excel
#   cols_rename      - column names assigned to the loaded frame, in file order
#   cols_selector    - the (renamed) date column and value column(s) to work on
#   preprocessing    - "merge" is the `how` of the merge onto the stock frame;
#                      "fillna" is only tested for truthiness — when set, the merged
#                      value columns are always linearly interpolated.
makro_config = {
    "macro_economy_features": {
        "bi_rate": {
            "read_data_config": {
                "usecols": ["Tanggal", "BI-7Day-RR"]
            },
            "cols_selector": {
                "date": "Date",
                "bi_rate": "BI-Rate"
            },
            "cols_rename": ["Date", "BI-Rate"],
            "preprocessing": {
                "merge": "left",
                "fillna": "interpolate" # After merge
            }
        },
        "inflasi": {
            "read_data_config": {
                "usecols": ["Periode", "Data Inflasi"],
                "skiprows": 4
            },
            "cols_selector": {
                "date": "Date",
                "inflasi": "Inflasi"
            },
            "cols_rename": ["Date", "Inflasi"],
            "preprocessing": {
                "merge": "left",
                "fillna": "interpolate" # After merge
            }
        },
        "jisdor": {
            "read_data_config": {
                "usecols": ["Tanggal", "Kurs"]
            },
            "cols_selector": {
                "date": "Date",
                "kurs": "Kurs"
            },
            "cols_rename": ["Date", "Kurs"],
            "preprocessing": {
                "merge": "left",
                "fillna": "interpolate" # After merge
            }
        },
        "m2": {
            "read_data_config": {
                "usecols": ["Tahun", "Uang Beredar Luas (M2)"]
            },
            "cols_selector": {
                "date": "Date",
                "m2": "M2"
            },
            "cols_rename": ["Date", "M2"],
            "preprocessing": {
                "merge": "left",
                "fillna": "interpolate" # After merge
            }
        },
        # VIX is the only table with several value columns and with dates parsed by
        # read_excel itself ("parse_dates").
        "vix": {
            "read_data_config": {
                "usecols": ["Date", "Open", "High", "Low", "Close"],
                "parse_dates": ["Date"]
            },
            "cols_selector": {
                "date": "Date",
                "value": ["Vix Open", "Vix High", "Vix Low", "Vix Close"]
            },
            "cols_rename": ["Date", "Vix Open", "Vix High", "Vix Low", "Vix Close"],
            "preprocessing": {
                "merge": "left",
                "fillna": "interpolate" # After merge
            }
        }
    }
}

In [None]:
# Working copy of the price table with "Date" as a regular column, so that the
# feature tables below can be merged onto it.
temp_stock_market = stock_market.reset_index()

# Keep "Date" as timezone-naive datetime64 at midnight. Python `date` objects
# (object dtype) cannot be merged with the datetime64 keys of the feature tables,
# and the finalize step uses the `.dt` accessor on this column.
stock_dates = pd.to_datetime(temp_stock_market["Date"])
if stock_dates.dt.tz is not None:
  # Drop the timezone but keep the local calendar day: converting a UTC+ midnight
  # to UTC first would move every session to the previous day.
  stock_dates = stock_dates.dt.tz_localize(None)
temp_stock_market["Date"] = stock_dates.dt.normalize()

In [None]:
# Locations of the feature spreadsheets on the mounted Google Drive.
# NOTE(review): GDRIVE_PATH is relative while the drive is mounted at
# "/content/drive", so these paths only resolve when the working directory is
# /content — confirm.
GDRIVE_PATH = "drive/MyDrive"
FEATURES_DATASET_PATH = "Datasets/Features"
MAKRO_EKONOMI_PATH = os.path.join(GDRIVE_PATH, FEATURES_DATASET_PATH, "Macro Economi")

In [None]:
# Optional macro feature: BI 7-day reverse repo rate, merged onto the price table by date.
if parameters["macro_economy_features"]["bi_rate"]:
  # Unpack config
  bi_rate_c = makro_config["macro_economy_features"]["bi_rate"]
  bi_rate_date_col = bi_rate_c["cols_selector"]["date"]
  bi_rate_value_col = bi_rate_c["cols_selector"]["bi_rate"]

  # Read Data
  bi_rate = pd.read_excel(
      os.path.join(MAKRO_EKONOMI_PATH, "BI-7Day-RR.xlsx"),
      **bi_rate_c["read_data_config"]
  )

  # Rename
  bi_rate.columns = bi_rate_c["cols_rename"]

  # Fix date format: parse the date strings, then order the rows chronologically
  bi_rate[bi_rate_date_col] = bi_rate[bi_rate_date_col].apply(dateparser.parse)
  bi_rate = bi_rate.sort_values(bi_rate_date_col).reset_index(drop=True)

  # Fix data format: strip the percent sign and whitespace, then convert to float.
  # Going through astype(str) keeps cells that Excel already stored as numbers
  # (or as NaN) from crashing the string clean-up.
  bi_rate[bi_rate_value_col] = (
      bi_rate[bi_rate_value_col]
      .astype(str)
      .str.replace("%", "", regex=False)
      .str.strip()
      .astype(float)
  )

  # Merge. The key must be datetime64 on both sides; this conversion is a no-op when
  # an earlier cell already did it, and makes this cell work regardless of which
  # other feature flags are enabled.
  temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
  temp_stock_market = temp_stock_market.merge(bi_rate, how=bi_rate_c["preprocessing"]["merge"], on="Date")

  if bi_rate_c["preprocessing"].get("fillna"):
    temp_stock_market[bi_rate_value_col] = temp_stock_market[bi_rate_value_col].interpolate(method="linear")

  # Register the new feature. dict.fromkeys de-duplicates while keeping insertion
  # order, so the feature order is identical on every run (set() order is not).
  parameters["data_preparation"]["features"] = list(
      dict.fromkeys(parameters["data_preparation"]["features"] + [bi_rate_value_col])
  )

In [None]:
# Optional macro feature: inflation rate, merged onto the price table by date.
if parameters["macro_economy_features"]["inflasi"]:
  # Unpack config
  inflasi_c = makro_config["macro_economy_features"]["inflasi"]
  inflasi_date_col = inflasi_c["cols_selector"]["date"]
  inflasi_value_col = inflasi_c["cols_selector"]["inflasi"]

  # Read Data
  inflasi = pd.read_excel(
      os.path.join(MAKRO_EKONOMI_PATH, "Data Inflasi.xlsx"),
      **inflasi_c["read_data_config"]
  )

  # Rename
  inflasi.columns = inflasi_c["cols_rename"]

  # Fix date format: parse the date strings, then order the rows chronologically
  inflasi[inflasi_date_col] = inflasi[inflasi_date_col].apply(dateparser.parse)
  inflasi = inflasi.sort_values(inflasi_date_col).reset_index(drop=True)

  # Fix data format: strip the percent sign and whitespace, then convert to float.
  # Going through astype(str) keeps cells that Excel already stored as numbers
  # (or as NaN) from crashing the string clean-up.
  inflasi[inflasi_value_col] = (
      inflasi[inflasi_value_col]
      .astype(str)
      .str.replace("%", "", regex=False)
      .str.strip()
      .astype(float)
  )

  # Merge. The key must be datetime64 on both sides.
  temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
  temp_stock_market = temp_stock_market.merge(inflasi, how=inflasi_c["preprocessing"]["merge"], on="Date")

  if inflasi_c["preprocessing"].get("fillna"):
    temp_stock_market[inflasi_value_col] = temp_stock_market[inflasi_value_col].interpolate(method="linear")

  # Register the new feature. dict.fromkeys de-duplicates while keeping insertion
  # order, so the feature order is identical on every run (set() order is not).
  parameters["data_preparation"]["features"] = list(
      dict.fromkeys(parameters["data_preparation"]["features"] + [inflasi_value_col])
  )

In [None]:
# Optional macro feature: JISDOR exchange rate ("Kurs"), merged onto the price table by date.
if parameters["macro_economy_features"]["jisdor"]:
  # Unpack config
  jisdor_c = makro_config["macro_economy_features"]["jisdor"]
  jisdor_date_col = jisdor_c["cols_selector"]["date"]
  jisdor_value_col = jisdor_c["cols_selector"]["kurs"]

  # Read Data
  jisdor = pd.read_excel(
      os.path.join(MAKRO_EKONOMI_PATH, "Informasi Kurs Jisdor.xlsx"),
      **jisdor_c["read_data_config"]
  )

  # Rename
  jisdor.columns = jisdor_c["cols_rename"]

  # Fix date format: parse the date strings, then order the rows chronologically
  jisdor[jisdor_date_col] = jisdor[jisdor_date_col].apply(dateparser.parse)
  jisdor = jisdor.sort_values(jisdor_date_col).reset_index(drop=True)

  # Merge. The key must be datetime64 on both sides; this conversion is a no-op when
  # an earlier cell already did it, and makes this cell work regardless of which
  # other feature flags are enabled.
  temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
  temp_stock_market = temp_stock_market.merge(jisdor, how=jisdor_c["preprocessing"]["merge"], on="Date")

  if jisdor_c["preprocessing"].get("fillna"):
    temp_stock_market[jisdor_value_col] = temp_stock_market[jisdor_value_col].interpolate(method="linear")

  # Register the new feature. dict.fromkeys de-duplicates while keeping insertion
  # order, so the feature order is identical on every run (set() order is not).
  parameters["data_preparation"]["features"] = list(
      dict.fromkeys(parameters["data_preparation"]["features"] + [jisdor_value_col])
  )

In [None]:
# Optional macro feature: broad money supply (M2), merged onto the price table by date.
if parameters["macro_economy_features"]["m2"]:
  # Unpack config
  m2_c = makro_config["macro_economy_features"]["m2"]
  m2_value_col = m2_c["cols_selector"]["m2"]

  # Read Data
  # NOTE(review): unlike the other tables, the date column is not parsed here, so the
  # merge relies on read_excel already returning datetimes for it — confirm.
  m2 = pd.read_excel(
      os.path.join(MAKRO_EKONOMI_PATH, "M2.xlsx"),
      **m2_c["read_data_config"]
  )

  # Rename
  m2.columns = m2_c["cols_rename"]

  # Fix data format: drop thousands separators and whitespace, then convert to float.
  # Going through astype(str) keeps cells that Excel already stored as numbers
  # (or as NaN) from crashing the string clean-up.
  m2[m2_value_col] = (
      m2[m2_value_col]
      .astype(str)
      .str.replace(",", "", regex=False)
      .str.strip()
      .astype(float)
  )

  # Merge. The stock-side key is converted to datetime64 (a no-op when an earlier
  # cell already did it) so this cell does not depend on other feature flags.
  temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
  temp_stock_market = temp_stock_market.merge(m2, how=m2_c["preprocessing"]["merge"], on="Date")

  if m2_c["preprocessing"].get("fillna"):
    temp_stock_market[m2_value_col] = temp_stock_market[m2_value_col].interpolate(method="linear")

  # Register the new feature. dict.fromkeys de-duplicates while keeping insertion
  # order, so the feature order is identical on every run (set() order is not).
  parameters["data_preparation"]["features"] = list(
      dict.fromkeys(parameters["data_preparation"]["features"] + [m2_value_col])
  )

#### VIX

In [None]:
# Optional macro feature: VIX index OHLC columns, merged onto the price table by date.
if parameters["macro_economy_features"]["vix"]:
  vix_config = makro_config["macro_economy_features"]["vix"]
  vix_value_cols = vix_config["cols_selector"]["value"]
  VIX_PATH = os.path.join(GDRIVE_PATH, FEATURES_DATASET_PATH, "VIX")
  vix = pd.read_excel(os.path.join(VIX_PATH, "VIX Index.xlsx"), **vix_config["read_data_config"])
  vix.columns = vix_config["cols_rename"]

  # Merge. The stock-side key is converted to datetime64 (a no-op when an earlier
  # cell already did it) so this cell does not depend on other feature flags.
  temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
  temp_stock_market = temp_stock_market.merge(
      vix,
      on=vix_config["cols_selector"]["date"],
      how=vix_config["preprocessing"]["merge"]
  )

  if vix_config["preprocessing"].get("fillna"):
    temp_stock_market[vix_value_cols] = temp_stock_market[vix_value_cols].interpolate(method="linear")

  # Register the new features. dict.fromkeys de-duplicates while keeping insertion
  # order, so the feature order is identical on every run (set() order is not).
  parameters["data_preparation"]["features"] = list(
      dict.fromkeys(parameters["data_preparation"]["features"] + list(vix_value_cols))
  )

#### Mikro

In [None]:
# Settings for the per-ticker micro-economy spreadsheet: which columns to read,
# what to call them after loading, and how to merge/fill them.
mikro_config = {
    # Wider column set, currently disabled:
    # "usecols": ["Tanggal", "ROE", "NPM", "PER", "PBV", "Rasio Lancar", "DER"],
    # "cols_rename": ["Date", "ROE", "NPM", "PER", "PBV", "Rasio Lancar", "DER"],
    "usecols": ["Tanggal", "ROE", "NPM", "PER"],
    "cols_rename": ["Date", "ROE", "NPM", "PER"],
    "preprocessing": {
        "merge": "left",
        "fillna": "interpolate" # After merge; only tested for truthiness
    }
}

In [None]:
# Optional micro feature: fundamentals of the configured ticker, merged onto the
# price table by date.
if parameters["micro_economy_features"]["ticker"]:
  MIKRO_EKONOMI_PATH = os.path.join(GDRIVE_PATH, FEATURES_DATASET_PATH, "Micro Economi")

  # Derive tickers and paths from the SAME filtered listing. Filtering only one of
  # the two lists (as before) shifts every pair after the excluded workbook, so a
  # ticker could be matched with another company's file.
  mikro_filenames = sorted(
      filename for filename in os.listdir(MIKRO_EKONOMI_PATH)
      if filename != "Micro Ekonomi.xlsx"
  )
  TICKER_LIST = [filename.split(".")[0] for filename in mikro_filenames]
  MIKRO_EKONOMI_FILES = [os.path.join(MIKRO_EKONOMI_PATH, filename) for filename in mikro_filenames]
  MIKRO_EKONOMI_DICT = dict(zip(TICKER_LIST, MIKRO_EKONOMI_FILES))

  # Value columns = every renamed column except the merge key. Computed locally so
  # the shared config is not mutated and the cell can be re-run.
  mikro_value_cols = [col for col in mikro_config["cols_rename"] if col != "Date"]

  for mikro_ekonomi_ticker, filepath in MIKRO_EKONOMI_DICT.items():
    # The configured ticker is used as a regex matched at the start of the file stem.
    if re.match(parameters["micro_economy_features"]["ticker"], mikro_ekonomi_ticker):
      mikro_ekonomi = pd.read_excel(filepath, usecols=mikro_config["usecols"])
      mikro_ekonomi.columns = mikro_config["cols_rename"]

      # The stock-side key is converted to datetime64 (a no-op when an earlier cell
      # already did it) so this cell does not depend on other feature flags.
      temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
      temp_stock_market = temp_stock_market.merge(mikro_ekonomi, how=mikro_config["preprocessing"]["merge"], on="Date")

      if mikro_config["preprocessing"].get("fillna"):
        for col in mikro_value_cols:
          temp_stock_market[col] = temp_stock_market[col].interpolate(method="linear")

      # Register the new features. dict.fromkeys de-duplicates while keeping
      # insertion order, so the feature order is identical on every run.
      parameters["data_preparation"]["features"] = list(
          dict.fromkeys(parameters["data_preparation"]["features"] + mikro_value_cols)
      )
      # Only the first matching file is used.
      break

#### FINALIZE

In [None]:
# Update date
# Drop every row that still has a missing value after merging/interpolation, then
# record the date range that actually survived in the run parameters.
temp_stock_market.dropna(inplace=True)
if temp_stock_market.empty:
  raise ValueError("No rows left after dropna(); check the enabled features and the date range.")

# Make sure "Date" is datetime64 even when no feature cell converted it.
temp_stock_market["Date"] = pd.to_datetime(temp_stock_market["Date"])
parameters["stock_market_detail"]["start_date"] = str(temp_stock_market["Date"].min().date())
parameters["stock_market_detail"]["end_date"] = str(temp_stock_market["Date"].max().date())

In [None]:
# Replace the raw price table with the merged one, indexed by "Date" again.
stock_market = temp_stock_market.set_index("Date")

In [None]:
stock_market

## Preprocessing Data

In [None]:
stock_market

In [None]:
# Independent copy used by the sentiment merge further down, so `stock_market` itself stays untouched.
preprocessed_stock_market = stock_market.copy()

## A. 1-2-Library Analysis Sentiment

In [None]:
!pip install snscrape

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import random
import pandas as pd
import datetime

## A. 2-2- Scraping Data Twitter

In [None]:
#Don't Run Again!!!
# Build "YYYY-MM-DD" strings for the last 6223 days, newest first, and skip the
# 14 most recent ones.
base = datetime.datetime.today()
date_list = list(map(lambda offset: base - datetime.timedelta(days=offset), range(6223)))
date_list_strptime = [day.strftime("%Y-%m-%d") for day in date_list]
datetime_range = date_list_strptime[14:]
datetime_range

In [None]:
#Don't Run Again!!!
# Pair each day with the following day as [since, until]; `datetime_range` is
# newest-first, so each pair is [older, newer]. `dates` is the same list reversed
# (oldest pair first).
datetimes = [
    [older_day, newer_day]
    for newer_day, older_day in zip(datetime_range, datetime_range[1:])
]
dates = list(reversed(datetimes))

In [None]:
#Don't Run Again!!!
dates[::-1][5232:]

In [None]:
#Don't Run Again!!!
dates[-5][1]

In [None]:
#Don't Run Again!!!
# Load and display a previously saved CSV from the working directory.
# `dp` is not referenced again in the visible code.
dp = pd.read_csv('data.csv')
dp

In [None]:
#Don't Run Again!!!
import snscrape.modules.twitter as sntwitter
import pandas
import csv
import warnings
warnings.filterwarnings("ignore")

# Creating list to append tweet data to

# Using TwitterSearchScraper to scrape data and append tweets to list
# For every topic and every [since, until] day pair, fetch at most ONE tweet
# (the loop breaks as soon as i > 0) and append it to 'tweets_per_keyword.csv'.
# The CSV is re-read and rewritten after each day, so it must already exist
# before the first iteration. Note that the topic itself is not stored in the row.
for topic in ['resesi', 'perang', 'bahan bakar minyak']:
    for date in dates[::-1]:
        tweets_list2 = []
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{topic} since:{date[0]} until:{date[1]}').get_items()):
            if i>0:
                break
            print(f'{topic}--{tweet.date}')
            tweets_list2.append([tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.url])

        # Turn this day's tweets into a frame and append them to the file on disk
        tweets_df2 = pd.DataFrame(tweets_list2, columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Url'])
        data = pd.read_csv('tweets_per_keyword.csv')
        datas = pd.concat([data, tweets_df2])
        datas.to_csv('tweets_per_keyword.csv', index=False)
    

In [None]:
#Don't Run Again!!!
dates[::-1][5550:]

In [None]:
#Don't Run Again!!!
# Collect up to five tweets per day pair for the keyword 'perang' (the loop breaks
# once i > 4), printing each one as it arrives.
tweets_list2 = []
for date in dates[::-1]:
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'perang since:{date[0]} until:{date[1]}').get_items()):
        if i>4:
            break
        tweets_list2.append(['perang',tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.url])
        print(f'{tweet.date}-{tweet.content}')

# After ALL days are scraped, build one frame and write it out in a single pass
# (nothing is saved if the loop above is interrupted).
tweets_df2 = pd.DataFrame(tweets_list2, columns=['keyword','Datetime', 'Tweet Id', 'Text', 'Username', 'Url'])
tweets_df2.to_csv(f'data_perang.csv', index=False)


In [None]:
#Don't Run Again!!!
# Collect up to five tweets per day pair for the combined query
# 'perang bahan bakar minyak resesi' (the loop breaks once i > 4), printing each one.
tweets_list3 = []
for date in dates[::-1]:
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'perang bahan bakar minyak resesi since:{date[0]} until:{date[1]}').get_items()):
        if i>4:
            break
        tweets_list3.append(['perang, bahan bakar minyak, resesi',tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.url])
        print(f'{tweet.date}-{tweet.content}')

# After ALL days are scraped, build one frame and write it out in a single pass
# (nothing is saved if the loop above is interrupted).
tweets_df3 = pd.DataFrame(tweets_list3, columns=['keyword','Datetime', 'Tweet Id', 'Text', 'Username', 'Url'])
tweets_df3.to_csv(f'data_kumpulan_keyword.csv', index=False)


## A. 3-2-Text Preprocessing

### Import Data

#### Data BBM

In [None]:
import pandas as pd
# Scraped tweets for the BBM (fuel) topic, loaded from Google Drive.
tweet_df_bbm = pd.read_csv("/content/drive/MyDrive/Datasets/Twitter/data_bbm.csv")
tweet_df_bbm.head(5)

#### Data Resesi

In [None]:
import pandas as pd
# Scraped tweets for the resesi (recession) topic, loaded from Google Drive.
tweet_df_resesi = pd.read_csv("/content/drive/MyDrive/Datasets/Twitter/data_resesi.csv")
tweet_df_resesi.head(5)

#### Data Perang

In [None]:
import pandas as pd
# Scraped tweets for the perang (war) topic, loaded from Google Drive.
tweet_df_perang = pd.read_csv("/content/drive/MyDrive/Datasets/Twitter/data_perang.csv")
tweet_df_perang.head(5)

### Preprocessing Data

In [None]:
!pip install Sastrawi

In [None]:
import pandas as pd
import numpy as np
import tweepy
import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
init_notebook_mode(connected=True)

# Machine Learning imports
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import string
import re
from string import punctuation
from collections import Counter

In [None]:
# helper function to clean tweets
def processTweet(tweet, keywords=('bbm', 'bahan', 'bakar', 'minyak', 'resesi', 'perang')):
    """Return a cleaned, lower-cased version of a tweet for tokenisation.

    Steps, in order: drop HTML entities, @mentions and $tickers; lower-case;
    drop hyperlinks and hashtags; drop the search keywords; replace punctuation
    with spaces; drop words of one or two characters; drop characters outside
    the Basic Multilingual Plane (e.g. most emoji); collapse whitespace.

    Args:
        tweet: Raw tweet text.
        keywords: Lower-case strings removed wherever they occur, including
            inside longer words. Defaults to the search terms used to collect
            the tweets, so they do not dominate the token counts.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'&\w*;', '', tweet)
    # Remove @mentions
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks. The URL ends at the first whitespace; a greedy ".*" here
    # would also swallow ordinary words up to the last "/" in the tweet.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove the search keywords (plain substring removal, applied one after another)
    for keyword in keywords:
        tweet = tweet.replace(keyword.lower(), '')
    # Replace punctuation with a space. The characters are escaped so that "]",
    # "\" and "-" are treated literally inside the character class.
    tweet = re.sub('[' + re.escape(punctuation.replace('@', '')) + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Done before whitespace clean-up so a removed emoji cannot leave a double space.
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    # Collapse every whitespace run (including new lines) and trim both ends
    return re.sub(r'\s+', ' ', tweet).strip()

# Normalise 'Text' to str and drop duplicate tweets, one topic at a time.
# Each topic must be derived from ITS OWN frame: building the resesi/perang frames
# from tweet_df_bbm would silently replace their tweets with the BBM ones.
tweet_df_bbm['Text'] = tweet_df_bbm['Text'].astype(str)
tweet_df_bbm = tweet_df_bbm.drop_duplicates('Text')
tweet_df_resesi['Text'] = tweet_df_resesi['Text'].astype(str)
tweet_df_resesi = tweet_df_resesi.drop_duplicates('Text')
tweet_df_perang['Text'] = tweet_df_perang['Text'].astype(str)
tweet_df_perang = tweet_df_perang.drop_duplicates('Text')
# clean dataframe's text column
tweet_df_bbm['Text_Clean'] = tweet_df_bbm['Text'].apply(processTweet)
tweet_df_resesi['Text_Clean'] = tweet_df_resesi['Text'].apply(processTweet)
tweet_df_perang['Text_Clean'] = tweet_df_perang['Text'].apply(processTweet)

In [None]:
tweet_df_bbm.head(5)

In [None]:
tweet_df_resesi.head(5)

In [None]:
tweet_df_perang.head(5)

In [None]:
#stopwords baruu
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Indonesian stop-word list from Sastrawi, used by text_process below.
list_stop_words = StopWordRemoverFactory().get_stop_words()
# Sastrawi stemmer instance; it is created here but not used in the visible code.
stemming = StemmerFactory().create_stemmer()
# tokenize helper function
def text_process(raw_text, stop_words=None):
    """Tokenise text: strip ASCII punctuation, lower-case, split, drop stop words.

    Args:
        raw_text: Text to tokenise.
        stop_words: Optional iterable of lower-case words to drop. Defaults to
            the module-level Sastrawi list ``list_stop_words``.

    Returns:
        List of remaining lower-case tokens, in their original order.
    """
    if stop_words is None:
        stop_words = list_stop_words
    # Set membership is O(1) per token; the list lookup was O(len(stop words)).
    stop_words = frozenset(stop_words)

    # Delete all punctuation characters in a single pass
    no_punctuation = raw_text.translate(str.maketrans('', '', string.punctuation))

    # Now just remove any stopwords
    return [word for word in no_punctuation.lower().split() if word not in stop_words]

# -------------------------------------------

# tokenize message column and create a column for tokens
# Tokenise the cleaned text of every topic frame into a new 'tokens' column (one list of words per tweet).
tweet_df_bbm['tokens'] = tweet_df_bbm['Text_Clean'].apply(text_process)
tweet_df_resesi['tokens'] = tweet_df_resesi['Text_Clean'].apply(text_process)
tweet_df_perang['tokens'] = tweet_df_perang['Text_Clean'].apply(text_process)

In [None]:
tweet_df_bbm.head(5)

In [None]:
tweet_df_resesi.head(5)

In [None]:
tweet_df_perang.head(5)

##  A. 4-2-Sentiment Scoring using indonesian-roberta-base-sentiment Model and EDA

# Sentiment Scoring

In [None]:
#Don't Run Again!!!
!pip install transformers

In [None]:
#Don't Run Again!!!
from transformers import pipeline

# Hugging Face model id of the Indonesian RoBERTa sentiment classifier.
model = f"w11wo/indonesian-roberta-base-sentiment-classifier"

# Build the sentiment-analysis pipeline, then smoke-test it on one sentence
# (the sample sentence is English although the model id says Indonesian).
sentiment_task = pipeline("sentiment-analysis", model=model)
sentiment_task("Covid cases are increasing fast!")

In [None]:
#Don't Run Again!!!
from tqdm.notebook import tqdm

#### BBM

In [None]:
#Don't Run Again!!!
# Score every BBM tweet with the sentiment pipeline. The RAW 'Text' column is
# scored (not 'Text_Clean'), and results are keyed by 'Tweet Id'.
sent_results = {}
count = 0  # only needed by the disabled early-exit below
for i, d in tqdm(tweet_df_bbm.iterrows(), total=len(tweet_df_bbm)):
    sent = sentiment_task(d["Text"])
    sent_results[d["Tweet Id"]] = sent
    count += 1
    # Disabled debug limit: stop after 500 tweets
    # if count == 500:
    #     break

In [None]:
#Don't Run Again!!!
# One row per tweet id; column 0 holds that tweet's pipeline result dict. Pull the
# label and score out into their own columns, then attach the BBM tweet metadata
# by tweet id.
sent_df = pd.DataFrame(sent_results).T
sent_df = sent_df.assign(
    label=sent_df[0].map(lambda result: result["label"]),
    score=sent_df[0].map(lambda result: result["score"]),
).merge(
    tweet_df_bbm.set_index("Tweet Id"), left_index=True, right_index=True
)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
#Don't Run Again!!!
sent_df.label.value_counts()

In [None]:
#Don't Run Again!!!
sent_df.groupby("label")["score"].plot(kind="hist", bins=50)
plt.legend()
plt.show()

In [None]:
#Don't Run Again!!!
# Signed sentiment score: +score for positive, -score for negative, 0 for neutral.
# Labels are compared case-insensitively: the rest of this notebook filters on the
# lower-case labels ('positive' / 'neutral' / 'negative'), so an exact match on
# "Negative" / "Neutral" would never fire and every score would stay positive.
sent_df["score_"] = sent_df["score"]

sentiment_label = sent_df["label"].str.lower()
sent_df.loc[sentiment_label == "negative", "score_"] = -sent_df.loc[sentiment_label == "negative", "score"]
sent_df.loc[sentiment_label == "neutral", "score_"] = 0

In [None]:
#Don't Run Again!!!
sent_df["score_"].plot(kind="hist", bins=50)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
# Load the previously saved BBM sentiment scores instead of re-running the model.
sent_df = pd.read_csv('/content/drive/MyDrive/Datasets/Twitter/sent_df_bbm.csv')
# The count of missing dates is computed but discarded (it is not the cell's last expression).
sent_df['Date'].isnull().sum()
# Keep only rows that have a 'Date' value.
sent_df = sent_df.dropna(subset=['Date'])

In [None]:
# Daily mean BBM sentiment, joined to the price/feature table on the calendar date.
sent_df["Date"] = pd.to_datetime(sent_df["Datetime"], utc=True).dt.date
sent_daily = sent_df.groupby("Date")["score_"].mean()

# Re-index the price table by plain `date` objects so both indexes are comparable.
clx_df = preprocessed_stock_market.reset_index()
clx_df["Date"] = clx_df["Date"].dt.date
clx_df = clx_df.set_index("Date")

# Index-on-index merge with the default inner join: only dates present on both sides remain.
sent_and_stock_bbm = sent_daily.to_frame("Sentiment_BBM").merge(
    clx_df, left_index=True, right_index=True
)

# Sentiment on the left axis, closing price on a twin right axis.
# `legend` is only an on/off switch in pandas; the legend text comes from `label`.
ax = sent_and_stock_bbm["Sentiment_BBM"].plot(label="Sentiment", legend=True)
ax2 = ax.twinx()
sent_and_stock_bbm["Close"].plot(ax=ax2, color="orange", label="Closing Price", legend=True)
plt.show()

In [None]:
sent_df

In [None]:
sent_and_stock_bbm

#### Resesi

In [None]:
#Don't Run Again!!!
# Score every resesi tweet with the sentiment pipeline. The RAW 'Text' column is
# scored (not 'Text_Clean'), and results are keyed by 'Tweet Id'.
sent_results = {}
count = 0  # only needed by the disabled early-exit below
for i, d in tqdm(tweet_df_resesi.iterrows(), total=len(tweet_df_resesi)):
    sent = sentiment_task(d["Text"])
    sent_results[d["Tweet Id"]] = sent
    count += 1
    # Disabled debug limit: stop after 500 tweets
    # if count == 500:
    #     break

In [None]:
#Don't Run Again!!!
# One row per tweet id; column 0 holds that tweet's pipeline result dict.
sent_df = pd.DataFrame(sent_results).T
sent_df["label"] = sent_df[0].apply(lambda x: x["label"])
sent_df["score"] = sent_df[0].apply(lambda x: x["score"])
# Attach the metadata of the tweets that were actually scored in this section
# (the resesi frame, not the BBM one).
sent_df = sent_df.merge(
    tweet_df_resesi.set_index("Tweet Id"), left_index=True, right_index=True
)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
#Don't Run Again!!!
sent_df.label.value_counts()

In [None]:
#Don't Run Again!!!
sent_df.groupby("label")["score"].plot(kind="hist", bins=50)
plt.legend()
plt.show()

In [None]:
#Don't Run Again!!!
# Signed sentiment score: +score for positive, -score for negative, 0 for neutral.
# Labels are compared case-insensitively: the rest of this notebook filters on the
# lower-case labels ('positive' / 'neutral' / 'negative'), so an exact match on
# "Negative" / "Neutral" would never fire and every score would stay positive.
sent_df["score_"] = sent_df["score"]

sentiment_label = sent_df["label"].str.lower()
sent_df.loc[sentiment_label == "negative", "score_"] = -sent_df.loc[sentiment_label == "negative", "score"]
sent_df.loc[sentiment_label == "neutral", "score_"] = 0

In [None]:
#Don't Run Again!!!
sent_df["score_"].plot(kind="hist", bins=50)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
# Load the previously saved resesi sentiment scores instead of re-running the model.
sent_df = pd.read_csv('/content/drive/MyDrive/Datasets/Twitter/sent_df_resesi.csv')
# The count of missing dates is computed but discarded (it is not the cell's last expression).
sent_df['Date'].isnull().sum()
# Keep only rows that have a 'Date' value.
sent_df = sent_df.dropna(subset=['Date'])

In [None]:
# Daily mean resesi sentiment, joined to the table that already carries the BBM sentiment.
sent_df["Date"] = pd.to_datetime(sent_df["Datetime"], utc=True).dt.date
sent_daily = sent_df.groupby("Date")["score_"].mean()

# The BBM table is already indexed by plain `date` objects, so no conversion is needed.
clx_df = sent_and_stock_bbm.reset_index().set_index("Date")

# Index-on-index merge with the default inner join: only dates present on both sides remain.
sent_and_stock_resesi = sent_daily.to_frame("Sentiment_Resesi").merge(
    clx_df, left_index=True, right_index=True
)

# Sentiment on the left axis, closing price on a twin right axis.
# `legend` is only an on/off switch in pandas; the legend text comes from `label`.
ax = sent_and_stock_resesi["Sentiment_Resesi"].plot(label="Sentiment", legend=True)
ax2 = ax.twinx()
sent_and_stock_resesi["Close"].plot(ax=ax2, color="orange", label="Closing Price", legend=True)
plt.show()

In [None]:
sent_df

In [None]:
sent_and_stock_resesi

#### Perang

In [None]:
#Don't Run Again!!!
# Score every perang tweet with the sentiment pipeline. The RAW 'Text' column is
# scored (not 'Text_Clean'), and results are keyed by 'Tweet Id'.
sent_results = {}
count = 0  # only needed by the disabled early-exit below
for i, d in tqdm(tweet_df_perang.iterrows(), total=len(tweet_df_perang)):
    sent = sentiment_task(d["Text"])
    sent_results[d["Tweet Id"]] = sent
    count += 1
    # Disabled debug limit: stop after 500 tweets
    # if count == 500:
    #     break

In [None]:
#Don't Run Again!!!
# One row per tweet id; column 0 holds that tweet's pipeline result dict.
sent_df = pd.DataFrame(sent_results).T
sent_df["label"] = sent_df[0].apply(lambda x: x["label"])
sent_df["score"] = sent_df[0].apply(lambda x: x["score"])
# Attach the metadata of the tweets that were actually scored in this section
# (the perang frame, not the BBM one).
sent_df = sent_df.merge(
    tweet_df_perang.set_index("Tweet Id"), left_index=True, right_index=True
)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
#Don't Run Again!!!
sent_df.label.value_counts()

In [None]:
#Don't Run Again!!!
sent_df.groupby("label")["score"].plot(kind="hist", bins=50)
plt.legend()
plt.show()

In [None]:
#Don't Run Again!!!
# Signed sentiment score: +score for positive, -score for negative, 0 for neutral.
# Labels are compared case-insensitively: the rest of this notebook filters on the
# lower-case labels ('positive' / 'neutral' / 'negative'), so an exact match on
# "Negative" / "Neutral" would never fire and every score would stay positive.
sent_df["score_"] = sent_df["score"]

sentiment_label = sent_df["label"].str.lower()
sent_df.loc[sentiment_label == "negative", "score_"] = -sent_df.loc[sentiment_label == "negative", "score"]
sent_df.loc[sentiment_label == "neutral", "score_"] = 0

In [None]:
#Don't Run Again!!!
sent_df["score_"].plot(kind="hist", bins=50)

In [None]:
#Don't Run Again!!!
sent_df

In [None]:
# Load the previously saved perang sentiment scores instead of re-running the model.
sent_df = pd.read_csv('/content/drive/MyDrive/Datasets/Twitter/sent_df_perang.csv')
# The count of missing dates is computed but discarded (it is not the cell's last expression).
sent_df['Date'].isnull().sum()
# Keep only rows that have a 'Date' value.
sent_df = sent_df.dropna(subset=['Date'])

In [None]:
# Daily mean perang sentiment, joined to the table that already carries the BBM and resesi sentiment.
sent_df["Date"] = pd.to_datetime(sent_df["Datetime"], utc=True).dt.date
sent_daily = sent_df.groupby("Date")["score_"].mean()

# The resesi table is already indexed by plain `date` objects, so no conversion is needed.
clx_df = sent_and_stock_resesi.reset_index().set_index("Date")

# Index-on-index merge with the default inner join: only dates present on both sides remain.
sent_and_stock_perang = sent_daily.to_frame("Sentiment_Perang").merge(
    clx_df, left_index=True, right_index=True
)

# Sentiment on the left axis, closing price on a twin right axis.
# `legend` is only an on/off switch in pandas; the legend text comes from `label`.
ax = sent_and_stock_perang["Sentiment_Perang"].plot(label="Sentiment", legend=True)
ax2 = ax.twinx()
sent_and_stock_perang["Close"].plot(ax=ax2, color="orange", label="Closing Price", legend=True)
plt.show()

In [None]:
sent_df

In [None]:
sent_and_stock_perang

In [None]:
sent_and_stock_perang.columns

In [None]:
# Put the final table into a fixed column order: prices, macro features, VIX,
# micro features, then the three sentiment scores.
sent_and_stock_perang = sent_and_stock_perang.loc[:, [
    'Open', 'High', 'Low', 'Close',
    'Inflasi', 'Kurs', 'M2',
    'Vix Open', 'Vix High', 'Vix Low', 'Vix Close',
    'ROE', 'NPM', 'PER',
    'Sentiment_Perang', 'Sentiment_Resesi', 'Sentiment_BBM',
]]

In [None]:
sent_and_stock_perang.columns

In [None]:
sent_and_stock_perang.head(5)

In [None]:
#Don't Run Again!!!
# Persist the merged price + feature + sentiment table to Google Drive (the date index is written as the first column).
sent_and_stock_perang.to_csv("/content/drive/MyDrive/Datasets/New Project/Dataset/final_dataset.csv")

### EDA using Word Cloud

In [None]:
import pandas as pd

# Load the three scored tweet tables (BBM, resesi, perang) in that order.
tweet_df_bbm1, tweet_df_resesi1, tweet_df_perang1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Datasets/Twitter/sent_df_bbm.csv",
        "/content/drive/MyDrive/Datasets/Twitter/sent_df_resesi.csv",
        "/content/drive/MyDrive/Datasets/Twitter/sent_df_perang.csv",
    )
)

In [None]:
# Copy the `tokens` column from each in-memory tweet table onto the matching
# freshly loaded table. Column assignment aligns rows by index label.
for loaded_df, tokenized_df in (
    (tweet_df_perang1, tweet_df_perang),
    (tweet_df_bbm1, tweet_df_bbm),
    (tweet_df_resesi1, tweet_df_resesi),
):
    loaded_df['tokens'] = tokenized_df['tokens']

In [None]:
# Keep only the rows that actually have `tokens`, rebinding the working names
# to the filtered tables.
tweet_df_bbm = tweet_df_bbm1[tweet_df_bbm1['tokens'].notna()]
tweet_df_resesi = tweet_df_resesi1[tweet_df_resesi1['tokens'].notna()]
tweet_df_perang = tweet_df_perang1[tweet_df_perang1['tokens'].notna()]

In [None]:
# Rows labelled 'positive', one subset per topic (a: BBM, b: resesi, c: perang).
df2a = tweet_df_bbm.loc[tweet_df_bbm['label'].eq('positive')]
df2b = tweet_df_resesi.loc[tweet_df_resesi['label'].eq('positive')]
df2c = tweet_df_perang.loc[tweet_df_perang['label'].eq('positive')]

In [None]:
# Rows labelled 'neutral', one subset per topic (a: BBM, b: resesi, c: perang).
df1a = tweet_df_bbm.loc[tweet_df_bbm['label'].eq('neutral')]
df1b = tweet_df_resesi.loc[tweet_df_resesi['label'].eq('neutral')]
df1c = tweet_df_perang.loc[tweet_df_perang['label'].eq('neutral')]

In [None]:
# Rows labelled 'negative', one subset per topic (a: BBM, b: resesi, c: perang).
df0a = tweet_df_bbm.loc[tweet_df_bbm['label'].eq('negative')]
df0b = tweet_df_resesi.loc[tweet_df_resesi['label'].eq('negative')]
df0c = tweet_df_perang.loc[tweet_df_perang['label'].eq('negative')]

#### Pie Chart Sentiment

##### BBM

In [None]:
# Count the rows per `label` value in the BBM table.
tweet_df_bbm["label"].value_counts()

In [None]:
# Drop the rows whose `label` equals this tweet URL.
stray_label = 'https://twitter.com/ZhafirahUmay/status/598518953749520384'
tweet_df_bbm = tweet_df_bbm[tweet_df_bbm['label'].ne(stray_label)]

In [None]:
# Print the current default figure size, then switch the default to 8x6.
plot_size = plt.rcParams["figure.figsize"]
print(plot_size[0])
print(plot_size[1])

plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size

# Share of BBM tweets per label (non-null `Text` rows per group).
# The wedge labels come from the grouped index rather than a hard-coded
# tuple, so each wedge is always named after the label it counts, whatever
# set of labels is present.
label_counts = tweet_df_bbm.groupby('label')['Text'].count()
label_counts.plot(kind='pie', autopct='%1.0f%%')

##### Resesi

In [None]:
# Count the rows per `label` value in the resesi table.
tweet_df_resesi["label"].value_counts()

In [None]:
# Drop the rows whose `label` equals this tweet URL.
stray_label = 'https://twitter.com/ZhafirahUmay/status/598518953749520384'
tweet_df_resesi = tweet_df_resesi[tweet_df_resesi['label'].ne(stray_label)]

In [None]:
# Print the current default figure size, then switch the default to 8x6.
plot_size = plt.rcParams["figure.figsize"]
print(plot_size[0])
print(plot_size[1])

plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size

# Share of resesi tweets per label (non-null `Text` rows per group).
# The wedge labels come from the grouped index rather than a hard-coded
# tuple, so each wedge is always named after the label it counts, whatever
# set of labels is present.
label_counts = tweet_df_resesi.groupby('label')['Text'].count()
label_counts.plot(kind='pie', autopct='%1.0f%%')

##### Perang

In [None]:
# Count the rows per `label` value in the perang table.
tweet_df_perang["label"].value_counts()

In [None]:
# Drop the rows whose `label` equals this tweet URL.
stray_label = 'https://twitter.com/ZhafirahUmay/status/598518953749520384'
tweet_df_perang = tweet_df_perang[tweet_df_perang['label'].ne(stray_label)]

In [None]:
# Print the current default figure size, then switch the default to 8x6.
plot_size = plt.rcParams["figure.figsize"]
print(plot_size[0])
print(plot_size[1])

plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size

# Share of perang tweets per label (non-null `Text` rows per group).
# The wedge labels come from the grouped index rather than a hard-coded
# tuple, so each wedge is always named after the label it counts, whatever
# set of labels is present.
label_counts = tweet_df_perang.groupby('label')['Text'].count()
label_counts.plot(kind='pie', autopct='%1.0f%%')

#### WC Total

In [None]:
from collections import Counter

##### BBM

In [None]:
# Flatten every row of `tokens` into one list; each row is iterated element
# by element (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in tweet_df_bbm['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq = Counter(all_words)
wordfreq.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Resesi

In [None]:
# Flatten every row of `tokens` into one list; each row is iterated element
# by element (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in tweet_df_resesi['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq = Counter(all_words)
wordfreq.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Perang

In [None]:
# Flatten every row of `tokens` into one list; each row is iterated element
# by element (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in tweet_df_perang['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq = Counter(all_words)
wordfreq.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#### WC Positive

##### BBM

In [None]:
# Flatten the `tokens` rows of the positive BBM subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df2a['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Resesi

In [None]:
# Flatten the `tokens` rows of the positive resesi subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df2b['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Perang

In [None]:
# Flatten the `tokens` rows of the positive perang subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df2c['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#### WC Neutral

##### BBM

In [None]:
# Flatten the `tokens` rows of the neutral BBM subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df1a['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Resesi

In [None]:
# Flatten the `tokens` rows of the neutral resesi subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df1b['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Perang

In [None]:
# Flatten the `tokens` rows of the neutral perang subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df1c['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq1 = Counter(all_words)
wordfreq1.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq1`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq1)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#### WC Negative

##### BBM

In [None]:
# Flatten the `tokens` rows of the negative BBM subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df0a['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq0 = Counter(all_words)
wordfreq0.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq0`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq0)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Resesi

In [None]:
# Flatten the `tokens` rows of the negative resesi subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df0b['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq0 = Counter(all_words)
wordfreq0.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq0`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq0)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

##### Perang

In [None]:
# Flatten the `tokens` rows of the negative perang subset into one list
# (assumes rows hold lists of tokens — TODO confirm).
all_words = [token for row_tokens in df0c['tokens'] for token in row_tokens]
# Tally the tokens and show the ten most frequent ones.
wordfreq0 = Counter(all_words)
wordfreq0.most_common(10)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud settings, kept together so the rendering call stays short.
cloud_options = dict(
    width=900,
    height=500,
    max_words=500,
    max_font_size=100,
    relative_scaling=0.5,
    colormap='gist_rainbow',
    normalize_plurals=True,
)
# Size the words by their counts in `wordfreq0`.
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(wordfreq0)

# Show the cloud as an image without axis ticks or frame.
plt.figure(figsize=(17, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()