In [1]:
import sqlite3
import pandas as pd

from nltk.corpus import stopwords
from autocorrect import Speller
import string
import re

## Text cleaning section

In [2]:
# 'no' and 'not' carry negation, which has meaning in sentiment analysis,
# so they are kept out of the stopword list.
stopword_exclusion = ['no', 'not']

# English stopwords from NLTK, minus the exclusions above (original order kept).
en_sw = list(filter(lambda word: word not in stopword_exclusion,
                    stopwords.words('english')))

In [None]:
'''
Text cleaning function (complete version,
for reference only in this project)
'''

#set spacy lemmatizer
# NOTE(review): this import sits in a reference-only cell instead of the
# imports cell at the top of the notebook; preprocessText needs this cell
# to have been run first because it reads the module-level `nlp`.
import spacy

# The parser and NER pipeline components are switched off; preprocessText
# only reads token.lemma_ from the resulting doc.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def preprocessText(stringTxt):
    '''
    Clean a raw tweet (complete version: with stopword removal and
    lemmatization).

    Steps, in order: drop links, mention names and hashtag words; turn
    newlines into spaces; lowercase; drop digits; drop every character
    outside string.printable (emoji, kaomoji, other non-ASCII); drop
    punctuation; autocorrect each word; restore project-specific terms;
    remove stopwords; lemmatize; restore more project-specific terms;
    collapse runs of spaces.

    Relies on the module-level names `en_sw` (stopword list), `nlp`
    (spaCy pipeline) and `Speller` (autocorrect).

    Parameters
    ----------
    stringTxt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased, lemmatized text.
    '''

    stringTxt = re.sub(r'http\S+', '', stringTxt) #remove links
    #the lookbehind keeps '@' / '#' themselves; they go away with the punctuation below
    stringTxt = re.sub(r'(?<=@)\w+', '', stringTxt) #remove mentions
    stringTxt = re.sub(r'(?<=#)\w+', '', stringTxt) #remove hashtag
    stringTxt = stringTxt.replace('\n', ' ') #remove newline escape sequence
    stringTxt = stringTxt.lower() #convert all char to lowercase
    stringTxt = re.sub(r'\d+', '', stringTxt) #remove numbers

    #-----------------------------------------------------------------------------------------
    #remove emoji/emoticons and kaomoji: keep only characters in string.printable
    printable = set(string.printable)
    stringTxt = ''.join(ch for ch in stringTxt if ch in printable)

    #-----------------------------------------------------------------------------------------
    #remove punctuations
    stringTxt = stringTxt.translate(str.maketrans('', '', string.punctuation))

    #trim leading/trailing whitespace
    stringTxt = stringTxt.strip()

    #-----------------------------------------------------------------------------------------
    #spelling autocorrect
    #The Speller is built once and cached on the function so that cleaning
    #thousands of tweets does not rebuild it on every call.
    spell = getattr(preprocessText, '_speller', None)
    if spell is None:
        spell = Speller('en')
        preprocessText._speller = spell
    stringTxt = ' '.join(spell(word) for word in stringTxt.split())

    #-----------------------------------------------------------------------------------------
    #words that should never be changed
    #NOTE: a 'gamespot' -> 'gamestop' replacement was tried here and left disabled
    stringTxt = stringTxt.replace('game stop', 'gamestop')
    stringTxt = stringTxt.replace('games tonk', 'gamestonk')

    #remove en stopwords
    stringTxt = ' '.join([word for word in stringTxt.split() if word not in en_sw])

    #Lemmatizer
    doc = nlp(stringTxt)
    stringTxt = ' '.join([token.lemma_ for token in doc])

    #words that should never be changed
    #Whole-word matches only: a plain substring replace would also rewrite
    #'website' -> 'wsbsite' and an intact 'wallstreetbets' -> 'wallstreetbetss'.
    stringTxt = re.sub(r'\bweb\b', 'wsb', stringTxt)
    stringTxt = re.sub(r'\bwallstreetbet\b', 'wallstreetbets', stringTxt)

    #remove extra space after numbers & special chars gone
    stringTxt = re.sub(r' +', ' ', stringTxt)

    return stringTxt

In [3]:
def preprocessText_for_TextBlob(stringTxt):
    '''
    Clean a raw tweet for TextBlob sentiment scoring.

    This is the one being used in this project; TextBlob seemed to work
    well without stopword removal and stemming/lemmatizing (CMIIW), so
    neither is applied here.

    The text used in corpus matrix and wordcloud will use a different
    text cleaning function which is located in notebook no. 3.

    Steps, in order: drop links, mention names and hashtag words; turn
    newlines into spaces; lowercase; drop digits; drop every character
    outside string.printable (emoji, kaomoji, other non-ASCII); drop
    punctuation; autocorrect each word; restore project-specific terms;
    collapse runs of spaces.

    Relies on the module-level name `Speller` (autocorrect).

    Parameters
    ----------
    stringTxt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''

    stringTxt = re.sub(r'http\S+', '', stringTxt) #remove links
    #the lookbehind keeps '@' / '#' themselves; they go away with the punctuation below
    stringTxt = re.sub(r'(?<=@)\w+', '', stringTxt) #remove mentions
    stringTxt = re.sub(r'(?<=#)\w+', '', stringTxt) #remove hashtag
    stringTxt = stringTxt.replace('\n', ' ') #remove newline escape sequence
    stringTxt = stringTxt.lower() #convert all char to lowercase
    stringTxt = re.sub(r'\d+', '', stringTxt) #remove numbers

    #-----------------------------------------------------------------------------------------
    #remove emoji/emoticons and kaomoji: keep only characters in string.printable
    printable = set(string.printable)
    stringTxt = ''.join(ch for ch in stringTxt if ch in printable)

    #-----------------------------------------------------------------------------------------
    #remove punctuations
    stringTxt = stringTxt.translate(str.maketrans('', '', string.punctuation))

    #trim leading/trailing whitespace
    stringTxt = stringTxt.strip()

    #-----------------------------------------------------------------------------------------
    #spelling autocorrect
    #The Speller is built once and cached on the function so that cleaning
    #thousands of tweets does not rebuild it on every call.
    spell = getattr(preprocessText_for_TextBlob, '_speller', None)
    if spell is None:
        spell = Speller('en')
        preprocessText_for_TextBlob._speller = spell
    stringTxt = ' '.join(spell(word) for word in stringTxt.split())

    #-----------------------------------------------------------------------------------------
    #words that should never be changed
    #NOTE: a 'gamespot' -> 'gamestop' replacement was tried here and left disabled
    stringTxt = stringTxt.replace('game stop', 'gamestop')
    stringTxt = stringTxt.replace('games tonk', 'gamestonk')
    #Whole-word matches only: a plain substring replace would also rewrite
    #'website' -> 'wsbsite', and (with no lemmatizer in this version) every
    #intact 'wallstreetbets' -> 'wallstreetbetss'.
    stringTxt = re.sub(r'\bweb\b', 'wsb', stringTxt) #there is no 'web' word in the collected sample pool
    stringTxt = re.sub(r'\bwallstreetbet\b', 'wallstreetbets', stringTxt)

    #No Stopword Removal

    #No Lemmatizer

    #remove extra space after numbers & special chars gone
    stringTxt = re.sub(r' +', ' ', stringTxt)

    return stringTxt

## Tweet data retrieval from DB

In [4]:
# Read database content for tweet data: every tweet joined with its author's
# screen name and verified flag.

query = '''SELECT Tweet.tweetid, Tweet.createddate, User.screenname, User.verified, Tweet.tweet
FROM Tweet INNER JOIN User
ON Tweet.userid = User.userid;'''

connection = sqlite3.connect('tweet_db_gamestonk_final2.db')
try:
    cursor = connection.cursor()
    cursor.execute(query)
    query_output = cursor.fetchall()  # list of row tuples, in SELECT column order
    cursor.close()
finally:
    # Always release the database file, even if the query fails; a later cell
    # reopens the same file to write the cleaned text back.
    connection.close()

In [5]:
# Column labels follow the SELECT order of the query that produced query_output.
outputDF = pd.DataFrame(
    query_output,
    columns=[
        'tweetid',
        'createddate',
        'screenname',
        'verified',
        'tweet',
    ],
)

In [6]:
# Preview the joined tweet/user frame (bare last expression -> rich display).
outputDF

Unnamed: 0,tweetid,createddate,screenname,verified,tweet
0,1354177670364459008,26-01-2021,LA1986,0,GAMESTONK #GameStonk #GameStop #GME
1,1354179313634070528,26-01-2021,NaveNage,0,#GameStonk #GME 🤑 https://t.co/xmpiXN4jvc
2,1354180173147594753,26-01-2021,GuillaumeKMG,0,#wallstreetbets sub has grown 40% since Januar...
3,1354185684244963328,26-01-2021,Xcllusivefire,0,"⁦@elonmusk⁩ GameStonk tshirt, Stonk investing ..."
4,1354189015868649475,26-01-2021,ReallyRealNOW,0,@elonmusk Give me that #gamestonk shirt! #gme ...
...,...,...,...,...,...
3002,1356386944947687424,01-02-2021,SheepofWSB,0,The Volkswagen squeeze had a major dip before ...
3003,1356386964681883648,01-02-2021,stirlospace,0,#Training #TRAIN #wreck #gamedev #GameStop #ga...
3004,1356389069740273664,01-02-2021,realStevoDaniel,0,Meme stocks right now 🌝 .... #gamestonk #memes...
3005,1356390619921440771,01-02-2021,PeaceKeeper500,0,To all my paper handers out there $AMC $GME #h...


## Clean the tweet text using the text cleaning function created earlier

In [7]:
# Clean every tweet with the TextBlob-oriented cleaner, keeping row order.
cleaned_tweet = list(map(preprocessText_for_TextBlob, outputDF['tweet'].tolist()))

In [8]:
# Attach the cleaned text as a new column; cleaned_tweet was built from
# outputDF['tweet'] in row order, so the values line up positionally.
outputDF['cleantweet'] = cleaned_tweet

In [9]:
# Create a backup of the frame (including the cleaned text) without the index column.
outputDF.to_csv(path_or_buf='clean_textblob_ver.csv', index=False)

## Store the cleaned tweet text into the DB

In [10]:
# Write each cleaned tweet back to its row in the Tweet table.
query = '''UPDATE Tweet SET cleantweet = ? WHERE tweetid = ?;'''

# Build the id list once (the original rebuilt it on every loop iteration).
# .tolist() is called on the tweetid column so the values are passed to
# sqlite3 as a plain Python list, paired positionally with cleaned_tweet.
update_rows = zip(cleaned_tweet, outputDF['tweetid'].tolist())

connection = sqlite3.connect('tweet_db_gamestonk_final2.db')
try:
    cursor = connection.cursor()
    cursor.executemany(query, update_rows)
    connection.commit()
    cursor.close()
finally:
    # Always release the database file. If an update failed before commit(),
    # the pending changes are not committed.
    connection.close()