# Preprocessing Review Text

Section 1
- Loading dataset from csv into Pandas dataframe
- Concatenate dataset and save as a new CSV file

Section 2
- Retrieve all dates based on Change (%)
- Saving cleaned version

Section 3
- Lemmatizing words of data from above section
- Saving lemmatized version

Section 4
- Lemmatizing words of data from above section
- Saving lemmatized version

Section 5
- Stemming words of data from above section
- Saving stemmed version

Example of Lemmatized Word vs Stemmed Word
- Exploring the effect on example word "purchase"

In [21]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
from nltk.stem.porter import PorterStemmer
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('vader_lexicon')


### Section 1 

Loading all the tweets and saving them as a Dataframe object.

In [22]:
#df = pd.concat(map(pd.read_csv, ['d1.csv', 'd2.csv','d3.csv']))
data = pd.read_csv("bitcoin_2020_tweets.csv",encoding="ISO-8859-1")


In [23]:
data.tail()

Unnamed: 0.1,Unnamed: 0,Datetime,Text
1499996,1499996,2020-04-08 12:19:01+00:00,BitCoin to have increase in hashrate post BCH ...
1499997,1499997,2020-04-08 12:18:58+00:00,#Bitcoin Futures Did Not âManipulateâ #BTC...
1499998,1499998,2020-04-08 12:18:53+00:00,Use #cryptocurrencies #bitcoin #BTC #Ethereum ...
1499999,1499999,2020-04-08 12:17:36+00:00,P.s #buy #btc. ...
1500000,1500000,2020-04-08 12:17:10+00:00,@ReuScherf @CobraBitcoin Also #Bitcoin is a he...


### Section 2

Selecting only the column Datetime and Text and transform Datetime Column to Date format

In [24]:
data = data [["Datetime", "Text"]]
data["Datetime"] = pd.to_datetime(data["Datetime"]).dt.date
data.drop_duplicates()
data.head()

Unnamed: 0,Datetime,Text
0,2020-12-29,@manwnomelanin @TheCrypt0Mask @NeerajKA Bitcoi...
1,2020-12-29,i think next stop is 31k\n\n$btc #btc #bitcoin
2,2020-12-29,altlar iÃ§in dikkatli olmakta fayda var btc ra...
3,2020-12-29,@PeterSchiff The government never took action ...
4,2020-12-29,$30k by Dec 30th. #Bitcoin #BTC ð ð


### Section 3
Pre-Processing of Tweets

- Function to pre-process tweets
- Function to handle Emoji
- Function to pre-process words
- Function to check if it is a valid word
- Function to check if tweets is english [Ongoing]
- Checking Stop words and Stemming in Main

In [25]:
stop_words = stopwords.words('english')
porter = PorterStemmer()

def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word


def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)


def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet



def preprocess_tweet(tweet):
    processed_tweet = []
    # Convert to lower case
    tweet = tweet.lower()
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'UM', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)

    #if TextBlob(tweet).detect_language != "en":
   # if detect(tweet) != "en":
    #    tweet = ""
    
    #if tweet length less than 3, to be removed
    tweet2 = tweet.split()
    if len(tweet2) < 3:
        tweet = ""

    word_tokens = word_tokenize(tweet)
   
    for word in word_tokens:
        word = preprocess_word(word)
        if is_valid_word(word):
            if not word in stop_words:
                word = str(porter.stem(word))
                processed_tweet.append(word)

    return ' '.join(processed_tweet) 

In [26]:
def check_language (tweet):
    if tweet != "":
        if detect(tweet) != "en":
            tweet = ""
    return tweet

In [27]:
data["processed_tweets"] = np.vectorize(preprocess_tweet)(data["Text"])
data.head()

Unnamed: 0,Datetime,Text,processed_tweets
0,2020-12-29,@manwnomelanin @TheCrypt0Mask @NeerajKA Bitcoi...,um um um bitcoin max coverag span internet acc...
1,2020-12-29,i think next stop is 31k\n\n$btc #btc #bitcoin,think next stop btc btc bitcoin
2,2020-12-29,altlar iÃ§in dikkatli olmakta fayda var btc ra...,altlar dikkatli olmakta fayda var btc rahat ye...
3,2020-12-29,@PeterSchiff The government never took action ...,um govern never took action gold guess gold su...
4,2020-12-29,$30k by Dec 30th. #Bitcoin #BTC ð ð,dec bitcoin btc


In [28]:
data["clean_tweets"] = np.vectorize(check_language)(data["processed_tweets"])

 # Section 4
Sentiment Analysis and Retrieving Average Compound Score per day

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid_obj = SentimentIntensityAnalyzer()

def sentiment_scores(sentence):
    sentiment_dict = sid_obj.polarity_scores(sentence)
    return sentiment_dict['compound']


In [None]:
#data = pd.read_csv("bitcoin_tweet_clean_2021(05-12).csv")
data = data.drop_duplicates()
data = data[data.clean_tweets != ""]
data = data.dropna(subset=["clean_tweets"])
data.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Text,processed_tweets,clean_tweets
3,3,2021-12-29,Current Bitcoin transaction fees: \n \nBCH ...,current bitcoin transact fee bch next block bc...,current bitcoin transact fee bch next block bc...
5,5,2021-12-29,Ether ($ETH) Beats Bitcoin ($BTC) in 2021 as V...,ether eth beat bitcoin btc volatil take bite b...,ether eth beat bitcoin btc volatil take bite b...
6,6,2021-12-29,Current #Bitcoin Price is $46445 #BTC #Crypto,current bitcoin price btc crypto,current bitcoin price btc crypto
9,9,2021-12-29,"BTC Latest Block Info: Block 716330 holds 2,86...",btc latest block info block hold transact tota...,btc latest block info block hold transact tota...
10,10,2021-12-29,BTCæå¤ã¾ã§ããå°ãð¤\n\nãªã¹ã¯ã...,bitcoin btc url,bitcoin btc url


In [None]:
data["Compound_score"] = np.vectorize(sentiment_scores)(data["clean_tweets"])

data.head()
data.drop(["Text", "processed_tweets","clean_tweets"], axis= 1)
grouped_df = data.groupby("Datetime")
mean_df = grouped_df.mean()
mean_df = mean_df.reset_index()
print(mean_df)


       Datetime    Unnamed: 0  Compound_score
0    2021-05-24  1.997544e+06        0.097977
1    2021-05-25  1.990162e+06        0.083451
2    2021-05-26  1.980309e+06        0.068418
3    2021-05-27  1.971212e+06        0.085017
4    2021-05-28  1.962057e+06        0.055294
..          ...           ...             ...
215  2021-12-25  3.402677e+04        0.078180
216  2021-12-26  2.805427e+04        0.062544
217  2021-12-27  2.125931e+04        0.103359
218  2021-12-28  1.306753e+04        0.053668
219  2021-12-29  4.269872e+03        0.060327

[220 rows x 3 columns]


In [None]:
data.to_csv("bitcoin_2020(056-12)CS.csv")

Confusion Matrix Tabulation

In [None]:
# counting for price hike days

falseneg = []
truepos = []
neu = []
def count_sentiment_hike(model, doc):
    ps = model.polarity_scores(doc)
    c = ps['compound']
    
    if c < 0:
        falseneg.append(1)
    elif c > 0:
        truepos.append(1)
    else:
        neu.append(1)
    
    result_FN = sum(falseneg)
    result_TP = sum(truepos)

    print(result_FN, result_TP)

    for i in range(len(crypto_tokens)):
    count_sentiment_hike(model," ".join(crypto_tokens[i]))

In [None]:
# counting for price dip days 

trueneg = []
falsepos = []
neu = []

def count_sentiment_dip(model, doc):
    ps = model.polarity_scores(doc)
    c = ps['compound']
    
    if c < 0:
        trueneg.append(1)
    elif c > 0:
        falsepos.append(1)
    else:
        neu.append(1)
        
    result_TN = sum(trueneg)
    result_FP = sum(falsepos)
        
    print(result_TN, result_FP)

    for i in range(len(crypto_tokens)):
    count_sentiment_dip(model," ".join(crypto_tokens[i]))

In [None]:
# counting for price rise days

falseneg = []
truepos = []
neu = []
def count_sentiment_rise(model, doc):
    ps = model.polarity_scores(doc)
    c = ps['compound']
    
    if c < 0:
        falseneg.append(1)
    elif c > 0:
        truepos.append(1)
    else:
        neu.append(1)
    
    result_FN = sum(falseneg)
    result_TP = sum(truepos)

    print(result_FN, result_TP)

    for i in range(len(crypto_tokens)):
    count_sentiment_rise(model," ".join(crypto_tokens[i]))

In [None]:
# counting for price drop days

trueneg = []
falsepos = []
neu = []

def count_sentiment_drop(model, doc):
    ps = model.polarity_scores(doc)
    c = ps['compound']
    
    if c < 0:
        trueneg.append(1)
    elif c > 0:
        falsepos.append(1)
    else:
        neu.append(1)
        
    result_TN = sum(trueneg)
    result_FP = sum(falsepos)
        
    print(result_TN, result_FP)

    for i in range(len(crypto_tokens)):
    count_sentiment_drop(model," ".join(crypto_tokens[i]))