In [2]:
# Enable the autoreload extension in mode 2 so that edits to the local
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Mounting Google Drive

In [3]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the project files are reachable.
drive.mount(mountpoint="/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Change the kernel's working directory to the project code folder.
# `%cd` is used rather than `!cd`: a `!` command runs in a throwaway subshell,
# so its directory change would not carry over to the cells that follow, and
# the relative paths used below ('datasets/', 'output/', 'utils/') would not resolve.
%cd /content/drive/MyDrive/self_projects/Twitter-Sentiment-Analysis/code

In [18]:
# List the current directory to confirm we are inside the project code folder.
# The explicit `%ls` form does not rely on automagic being enabled and cannot be
# shadowed by a Python variable named `ls`.
%ls

[0m[01;34mdatasets[0m/  main.ipynb  [01;34mmodel[0m/  [01;34moutput[0m/  sentiment-analysis.ipynb  [01;34mutils[0m/


### Preprocess Data

In [19]:
from utils.preprocess import TextPreprocessor

# A single preprocessor, constructed with 'output/', handles both splits.
text_processor = TextPreprocessor('output/')

# (raw CSV path, is_test flag): the train split is processed first, then the test split.
for raw_csv, is_test_split in (
    ('datasets/train_tweet.csv', False),
    ('datasets/test_tweets.csv', True),
):
    text_processor.preprocess_txt(raw_csv, is_test=is_test_split)

Reading datasets ...
Preprocessing datasets ...
Preprocess datasets saved.
Reading datasets ...
Preprocessing datasets ...
Preprocess datasets saved.


In [20]:
from utils.preprocess import StatsPreprocessor

In [21]:
# Statistics helper, constructed with the 'output/' location.
stat_processor = StatsPreprocessor("output/")
# Compute statistics over the processed training tweets.
stat_processor.generate_stats("output/train_tweet-processed.csv")
# Divider line between the progress messages and the printed report.
print(20 * "-")
stat_processor.print_stats()

Reading datasets ...
Preprocessing datasets stats ...
Calculating frequency distribution
Saved uni-frequency distribution to:  output/freqdist.pkl
Saved bi-frequency distribution to:  output/freqdist-bi.pkl
--------------------
[Analysis Statistics]
Tweets => Total: 31962, Positive: 2242, Negative: 29677
User Mentions => Total: 17517, Avg: 0.5480570677679745, Max: 11
URLs => Total: 6, Avg: 0.00018772292096865028, Max: 1
Emojis => Total: 660, Positive: 473, Negative: 187, Avg: 0.02064952130655153, Max: 4
Words => Total: 373086, Unique: 36909, Avg: 11.672798948751643, Max: 32, Min: 0
Bigrams => Total: 341187, Unique: 171585, Avg: 10.674770039421814


Performing sentiment analysis using different models.

1. Using wordsets to classify tweets
  - We classify each tweet using predefined lists of positive and negative words.

In [22]:
import pandas as pd

In [23]:
# Processed tweet CSVs (train split, test split) consumed by the classifiers below.
TRAIN_PROCESSED_FILE, TEST_PROCESSED_FILE = (
    'output/train_tweet-processed.csv',
    'output/test_tweets-processed.csv',
)
# Word lists (positive, negative) used by the word-based baseline.
POSITIVE_WORDS_FILE, NEGATIVE_WORDS_FILE = (
    'datasets/positive-words.txt',
    'datasets/negative-words.txt',
)

In [24]:
from model.baseline import WordBasedClassifier

In [25]:
# Baseline model: classify tweets using the positive and negative word lists.
word_classifier = WordBasedClassifier(POSITIVE_WORDS_FILE, NEGATIVE_WORDS_FILE)
# Predictions for the processed training tweets.
train_pred = word_classifier.predict(TRAIN_PROCESSED_FILE)
# Predictions for the processed test tweets; is_test=True is passed for this split.
test_pred = word_classifier.predict(TEST_PROCESSED_FILE, is_test=True)
# NOTE(review): the recorded run reports a training accuracy of ~0.19
# (6229/31962), far below what a constant majority-class guess would score on
# this data. This may mean the classifier's 0/1 polarity is inverted relative
# to the dataset labels -- confirm in model/baseline.py.


Correct predictions: 6229/31962
Training accuracy:  0.19488767911895374


In [26]:
# Load the processed test tweets and attach the baseline predictions.
test_data = pd.read_csv(TEST_PROCESSED_FILE)
# Trim the predictions to the frame's own row count instead of a hard-coded
# 17197, so the cell keeps working if the test file changes size.
test_data['predictions'] = test_pred[:len(test_data)]

In [27]:
# Preview the first five tweets next to their predicted labels.
test_data.loc[:, ['tweet', 'predictions']].head()

Unnamed: 0,tweet,predictions
0,studiolife aislife requires passion dedication...,0
1,USER_MENTION white supremacists want everyone ...,1
2,safe ways to heal your acne altwaystoheal heal...,1
3,is the hp and the cursed child book up for res...,1
4,bihday to my amazing hilarious nephew eli ahmi...,1


2. Neural Net Classifier

In [28]:
from model.neural_net import NeuralNetClassifier

In [39]:
# Neural-net classifier built from the unigram and bigram frequency-distribution
# pickles; use_bigram=True is passed (presumably enabling bigram features --
# confirm in model/neural_net.py).
nn_classifier = NeuralNetClassifier('output/freqdist.pkl', 'output/freqdist-bi.pkl', use_bigram=True)
# Train on the processed training tweets; the recorded log shows the data being
# split into train and validation parts before the epochs run.
nn_classifier.fit(TRAIN_PROCESSED_FILE)
# NOTE(review): the recorded val_acc values (1.0495, 1.0517) exceed 1.0, which is
# impossible for an accuracy. The validation metric is probably divided by the
# wrong count -- confirm in NeuralNetClassifier before trusting these numbers.

Processing tweets ...
Generating features
Splitting data for train and val ...
Training | Epoch:  0
Iteration 0/449, loss:0.5173752903938293, acc:0.90625
Iteration 100/449, loss:0.23333147168159485, acc:0.90625
Iteration 200/449, loss:0.15316656231880188, acc:0.921875
Iteration 300/449, loss:0.08630434423685074, acc:0.96875
Iteration 400/449, loss:0.12781333923339844, acc:0.953125
Epoch: 1, val_acc:1.049498746867168
Accuracy improved from 0.0000 to 1.0495, saving model
Training | Epoch:  1
Iteration 0/449, loss:0.02460017427802086, acc:1.0
Iteration 100/449, loss:0.04541803151369095, acc:1.0
Iteration 200/449, loss:0.038176681846380234, acc:0.984375
Iteration 300/449, loss:0.0423038974404335, acc:0.984375
Iteration 400/449, loss:0.06064079701900482, acc:0.984375
Epoch: 2, val_acc:1.0516917293233083
Accuracy improved from 1.0495 to 1.0517, saving model


In [None]:
# Show the first processed training tweet as a quick sanity check.
pd.read_csv(TRAIN_PROCESSED_FILE).loc[0, 'tweet']