In [1]:
import numpy as np
import pandas as pd
import datetime, pytz
import matplotlib.pyplot as plt 
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm
from sklearn import preprocessing
from textblob import TextBlob
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import re

### Text preprocessing and dataframe preparation

In [2]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
stop_words = nltk.corpus.stopwords.words(['english'])

lem = WordNetLemmatizer()

def cleaning(data):
    tweet_without_url = re.sub(r'http\S+',' ', data)
    tweet_without_hashtag = re.sub(r'#\w+', ' ', tweet_without_url)
    tweet_without_mentions = re.sub(r'@\w+',' ', tweet_without_hashtag)
    precleaned_tweet = re.sub('[^A-Za-z]+', ' ', tweet_without_mentions)
    tweet_tokens = TweetTokenizer().tokenize(precleaned_tweet)
    tokens_without_punc = [w for w in tweet_tokens if w.isalpha()]
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    text_cleaned = [lem.lemmatize(t) for t in tokens_without_sw]
    return " ".join(text_cleaned)

def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity

def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity

def getSentiment(score):
    if score < 0:
        return 'negative'
    elif score == 0:
        return 'neutral'
    else:
        return 'positive'

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\timvu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timvu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\timvu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\timvu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
tweet_data = pd.read_csv('./data/tweets_sentiment_2018-2022.csv')
tweet_data.head()

Unnamed: 0,timestamp,user,replies,likes,retweets,text,polarity,influence,score
0,2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...,0.6705,6e-06,4e-06
1,2022-05-20 22:08:23+00:00,takethatcdc,0,1,0,Elon Musk dazzles world with plan to manufactu...,0.0,6e-06,0.0
2,2022-05-20 22:08:21+00:00,dbonatoliv,0,1,0,"@BitcoinMagazine Its fine, we don't need more ...",0.3607,6e-06,2e-06
3,2022-05-20 22:08:20+00:00,theincomeblog,0,1,0,Bitmain Antminer APW7 PSU 1800W Power Supply f...,0.0,6e-06,0.0
4,2022-05-20 22:07:55+00:00,doctoryev,0,1,0,"""Web3"" is uptrending the last 2 months. Other...",0.6705,6e-06,4e-06


In [5]:
df = tweet_data.copy()
df = df[['timestamp', 'text', 'polarity']]
df['cleaned_text'] = df['text'].apply(cleaning)
df['timestamp'] = pd.to_datetime(tweet_data['timestamp'])
df.head()

# df['subjectivity'] = df['cleaned_tweets'].apply(getSubjectivity)
# df['polarity'] = df['cleaned_tweets'].apply(getPolarity)
# df['sentiment'] = df['polarity'].apply(getSentiment)
# df.head()

Unnamed: 0,timestamp,text,polarity,cleaned_text
0,2022-05-20 22:08:42+00:00,I need to get better at slow-pilling. I feel l...,0.6705,I need get better slow pilling I feel like I c...
1,2022-05-20 22:08:23+00:00,Elon Musk dazzles world with plan to manufactu...,0.0,Elon Musk dazzle world plan manufacture infant...
2,2022-05-20 22:08:21+00:00,"@BitcoinMagazine Its fine, we don't need more ...",0.3607,Its fine need whale But surprise respect prope...
3,2022-05-20 22:08:20+00:00,Bitmain Antminer APW7 PSU 1800W Power Supply f...,0.0,Bitmain Antminer APW PSU W Power Supply Bitcoi...
4,2022-05-20 22:07:55+00:00,"""Web3"" is uptrending the last 2 months. Other...",0.6705,Web uptrending last month Other terminology de...


In [7]:
df.to_csv('./data/tweets_sentiment_train_2018-2022.csv')