# Twitter Sentiment Analysis - Logistic Regression 
## <div> Vassilis Panagakis </div>

In [1]:
import pandas as pd
import numpy as np 
import re
import warnings

warnings.filterwarnings('ignore')  

## Load Data

In [2]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#### Create a dataframe from SentimentTweets.csv file data

In [3]:
df=pd.read_csv('gdrive/My Drive/Colab Notebooks/SentimentTweets.csv', usecols=['target','id','date','flag','user','text'])

df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,2249621587,Fri Jun 19 22:41:08 PDT 2009,NO_QUERY,sukumarpant,#brokenpromises...
1,0,2059003515,Sat Jun 06 16:03:21 PDT 2009,NO_QUERY,MTMSparrow,David Carradine so sad. Thai's law not sure i...
2,4,2017466467,Wed Jun 03 08:26:14 PDT 2009,NO_QUERY,itsmemcee,A @ 415 B @ 425. Tell your bro i say congrats!
3,4,2186457254,Mon Jun 15 18:52:04 PDT 2009,NO_QUERY,jdfreivald,@littlefluffycat Indeed.
4,4,2064458395,Sun Jun 07 06:19:20 PDT 2009,NO_QUERY,CrazyHan,Completed Race 4 Life in 58mins with girlies f...


In [4]:
df.drop(columns=['id', 'date', 'flag', 'user'], axis=1, inplace=True) #drop useless columns

df.head()

Unnamed: 0,target,text
0,0,#brokenpromises...
1,0,David Carradine so sad. Thai's law not sure i...
2,4,A @ 415 B @ 425. Tell your bro i say congrats!
3,4,@littlefluffycat Indeed.
4,4,Completed Race 4 Life in 58mins with girlies f...


In [5]:
#split 'target' column from the rest of the data
X = df[['text']]
y = df[['target']]

In [6]:
from sklearn.model_selection import train_test_split

#get train and test dataframes 
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, stratify=df['target'], random_state = 42)

#### Display train and test sets after split

In [7]:
train_X.head()

Unnamed: 0,text
492660,The respected journalist @robfahey just tried ...
858754,"@maggienash Thanks, Maggie. I'm still bouncing"
746070,"@do0dlebugdebz not now,but will be if you're o..."
885531,@Pure798 yea that works...ughh
806384,@melapoo lol Cool. I keep getting the weird on...


In [8]:
test_X.head()

Unnamed: 0,text
220479,...... fuck you soderling. fuck you. this has ...
840623,OK next step - today I learn to twitter from m...
772674,trying to upload a picture and loves how twitt...
1037193,@addictedtonye Get 100 followers a day using w...
971656,@AmeliaBt It sure is..


## Labels Pre-processing

In [9]:
#replace 4 with 1 to create binary labels
train_Y = train_Y.replace(4,1)
test_Y = test_Y.replace(4,1)

In [10]:
#create numpy arrays for sets' labels
train_y = np.asarray(train_Y['target'].tolist()) 
test_y = np.asarray(test_Y['target'].tolist())

#### Display binary labels 

In [11]:
train_Y.head()

Unnamed: 0,target
492660,0
858754,1
746070,1
885531,0
806384,1


In [12]:
test_Y.head()

Unnamed: 0,target
220479,0
840623,1
772674,0
1037193,1
971656,0


## Data Pre-processing

#### Load cleansed data from csv


In [None]:
# restore processed data
train_X = pd.read_csv('gdrive/My Drive/Colab Notebooks/CleanedTrain.csv', usecols=['text','processedText'])
test_X = pd.read_csv('gdrive/My Drive/Colab Notebooks/CleanedTest.csv', usecols=['text','processedText'])

In [13]:
#function that removes all @mentions, links and non alphabetic strings 
def clean_content(text):
    
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) #remove text with @ prefix
    text = re.sub(r'http\S+', '', text) #remove text with http prefix (links)  
    text = re.sub(r'www\S+', '', text) #remove text with www prefix (links)
    text = re.sub(r'\\\w+', '', str(text)) #remove text after backslash
    text = re.sub(r'\b\w{1,2}\b', '', text) #remove text containing 2 or less characters
    
    text =  ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    
    text = text.lower() #convert text into lowercase
    
    return text

In [14]:
#create a column for each set containing the processed text data
for index, row in train_X.iterrows():
    train_X.loc[index,'processedText'] = clean_content(train_X.loc[index,'text'])

for index, row in test_X.iterrows():
    test_X.loc[index,'processedText'] = clean_content(test_X.loc[index,'text'])

#### Display train and test sets after text cleansing

In [33]:
train_X.head()

Unnamed: 0,text,processedText
492660,The respected journalist @robfahey just tried ...,the respected journalist just tried have sex...
858754,"@maggienash Thanks, Maggie. I'm still bouncing",thanks maggie still bouncing
746070,"@do0dlebugdebz not now,but will be if you're o...",not nowbut will you just sounded like da...
885531,@Pure798 yea that works...ughh,yea that worksughh
806384,@melapoo lol Cool. I keep getting the weird on...,lol cool keep getting the weird ones got th...


In [34]:
test_X.head()

Unnamed: 0,text,processedText
220479,...... fuck you soderling. fuck you. this has ...,fuck you soderling fuck you this has night...
840623,OK next step - today I learn to twitter from m...,next step today learn twitter from phone ...
772674,trying to upload a picture and loves how twitt...,trying upload picture and loves how twitter ...
1037193,@addictedtonye Get 100 followers a day using w...,get followers day using once you add every...
971656,@AmeliaBt It sure is..,sure


In [None]:
# # store processed data to a csv file
# train_X.to_csv('gdrive/My Drive/Colab Notebooks/CleanedTrain.csv', index = True, header=True)
# test_X.to_csv('gdrive/My Drive/Colab Notebooks/CleanedTest.csv', index = True, header=True)

### Tf-idf Vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfVectorizer = TfidfVectorizer(max_df=0.99, min_df=1, max_features=850)

In [16]:
tfidf = tfidfVectorizer.fit_transform(train_X['processedText'])

vectors = []
for v in tfidf.toarray():
    vectors.append(v)

#create a column in train set with texts' words tf-idf counts transformation
train_X['tf-idf'] = pd.Series(vectors,index=train_X.index)

In [17]:
tfidf = tfidfVectorizer.transform(test_X['processedText'])

vectors = []
for v in tfidf.toarray():
    vectors.append(v)

#create a column in test set with texts' words tf-idf counts transformation    
test_X['tf-idf'] = pd.Series(vectors,index=test_X.index)

#### Insert 'tf-idf' columns to arrays for both training and test sets

In [18]:
train_tfidf = np.asarray(train_X['tf-idf'].tolist())
test_tfidf = np.asarray(test_X['tf-idf'].tolist())

### Evaluation Functions

In [23]:
#create a table to display the accuracy and F1-score of the Classifier 
lr_dic = {'LR':['-','-','-']}

lr_df = pd.DataFrame.from_dict(lr_dic, orient='index', columns=['Precision','Recall','F1-Score'])

In [24]:
md={}

#function that uses values from a dictionary to create a dataframe to store model evaluation values
def model_dataframe(classifier, avg_prec, prec_std, avg_rec, rec_std, avg_f1, f1_std):

    md['classifier'] = classifier
    md['avg-precision'] = avg_prec
    md['precision-std'] = prec_std
    md['avg-recall'] = avg_rec
    md['recall-std'] = rec_std
    md['avg-f1-score'] = avg_f1
    md['f1-score-std'] = f1_std
    
    df = pd.DataFrame(data=md) #create dataframe from dictionary data
    
    return df 

In [25]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

#define k-fold evaluation methods
numOfFolds = 10
cv1 = KFold(n_splits=numOfFolds, shuffle=True)
cv2 = StratifiedKFold(n_splits=numOfFolds, shuffle=True)

In [26]:
from sklearn import metrics

#scoring methods that calculate the precision, recall and f-mesure values for every fold of k-fold cross validation 

def precEval(y_true, y_pred):
    fold_report = metrics.classification_report(y_true, y_pred, output_dict=True)
    prec = fold_report['macro avg']['precision']
    return prec

def recEval(y_true, y_pred):
    fold_report = metrics.classification_report(y_true, y_pred, output_dict=True)
    rec = fold_report['macro avg']['recall']
    return rec

def f1Eval(y_true, y_pred):
    fold_report = metrics.classification_report(y_true, y_pred, output_dict=True)
    f1 = fold_report['macro avg']['f1-score']
    return f1

## Classification

In [27]:
#initialize empty lists to store column values of the dataframe
classifier, avg_prec, prec_std, avg_rec, rec_std, avg_f1, f1_std = ([] for i in range(7))

###  Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

classifier.append('Logistic Regression')

#Logistic Regression Classifier using word tfidf transformation features
lr = LogisticRegression(C=1, class_weight='balanced', solver='lbfgs', max_iter=20)

lr.fit(train_tfidf, train_y)
y_pred_lr = lr.predict(test_tfidf) #prediction on test set

lr_df.loc['LR','Precision'] = "%.3f%%" % (metrics.precision_score(test_y, y_pred_lr) * 100)
lr_df.loc['LR','Recall'] = "%.3f%%" % (metrics.recall_score(test_y, y_pred_lr) * 100)
lr_df.loc['LR','F1-Score'] = "%.3f%%" % (metrics.f1_score(test_y, y_pred_lr) * 100)

In [29]:
#stratified 10-fold Cross Validation on train set
prec = cross_val_score(lr, train_tfidf, train_y, cv=cv2, scoring=metrics.make_scorer(precEval))
rec = cross_val_score(lr, train_tfidf, train_y, cv=cv2, scoring=metrics.make_scorer(recEval))
f1 = cross_val_score(lr, train_tfidf, train_y, cv=cv2, scoring=metrics.make_scorer(f1Eval))

av_prec = "%.3f%%" % (prec.mean() * 100)
av_rec = "%.3f%%" % (rec.mean() * 100)
av_f1 = "%.3f%%" % (f1.mean() * 100)

avg_prec.append(av_prec), prec_std.append(prec.std())
avg_rec.append(av_rec), rec_std.append(rec.std())
avg_f1.append(av_f1), f1_std.append(f1.std())

(None, None)

## Model Evaluation

### Prediction on test set

In [30]:
lr_df

Unnamed: 0,Precision,Recall,F1-Score
LR,74.262%,77.093%,75.651%


### Stratified 10-fold Cross Validation on train set

In [31]:
mdf = model_dataframe(classifier, avg_prec, prec_std, avg_rec, rec_std, avg_f1, f1_std)
mdf

Unnamed: 0,classifier,avg-precision,precision-std,avg-recall,recall-std,avg-f1-score,f1-score-std
0,Logistic Regression,75.315%,0.00136,75.260%,0.001262,75.256%,0.001285
