# Q2

In [38]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix
import nltk
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [91]:
data = pd.read_csv("Q2 Sentiment Analysis Dataset.csv",encoding='latin1')
data.head()

Unnamed: 0,id,sentiment,date,text,Unnamed: 4,Unnamed: 5
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,,
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...,,
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,,
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on...",,
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...,,


In [92]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3886 non-null   int64  
 1   sentiment   3886 non-null   object 
 2   date        3886 non-null   object 
 3   text        3886 non-null   object 
 4   Unnamed: 4  0 non-null      float64
 5   Unnamed: 5  0 non-null      float64
dtypes: float64(2), int64(1), object(3)
memory usage: 182.3+ KB


In [93]:
df = data.drop(columns=['id','date','Unnamed: 4', 'Unnamed: 5'])
df.head()

Unnamed: 0,sentiment,text
0,1,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,1,@apple Contact sync between Yosemite and iOS8 ...
2,1,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,1,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,1,i get the storage almost full notification lit...


In [94]:
df['sentiment'] = df['sentiment'].replace({'1': 'negative', '3': 'neutral', '5': 'positive'})
df.sentiment.value_counts().head()

neutral         2162
negative        1219
positive         423
not_relevant      82
Name: sentiment, dtype: int64

In [82]:
df.head()

Unnamed: 0,sentiment,text
0,negative,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,negative,@apple Contact sync between Yosemite and iOS8 ...
2,negative,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,negative,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,negative,i get the storage almost full notification lit...


In [95]:
def preprocess_text(text):
    text = text.str.replace(r'[\(\[].*?[\)\]]', '')
    text = text.str.replace('\n', ' ')
    text = text.str.lower()
    text = text.str.replace(r'\d+', '')
    text = text.str.replace(r'[@#]', '')
    text = text.str.replace('[{}]'.format(string.punctuation), '')

    clean_text = []
    ignore = set(stopwords.words('english'))

    for i in text:
        words = nltk.word_tokenize(i)
        words = [word for word in words if word not in ignore and len(word) > 1]
        res_text = " ".join(words)
        clean_text.append(res_text)

    return clean_text

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

In [96]:
clean_df=pd.DataFrame(columns=['old_text','text'])
clean_df['old_text'] = df['text']
clean_df['text'] = preprocess_text(df['text'])
clean_df['text_lemmatized'] = clean_df['text'].apply(lemmatize_text)
clean_df['text_separated'] = clean_df['text_lemmatized'].apply(str)
clean_df['sentiment'] = df['sentiment']

clean_df.head()

  text = text.str.replace(r'[\(\[].*?[\)\]]', '')
  text = text.str.replace(r'\d+', '')
  text = text.str.replace(r'[@#]', '')
  text = text.str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,old_text,text,text_lemmatized,text_separated,sentiment
0,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,wtf battery one second ago wtf apple,wtf battery one second ago wtf apple,wtf battery one second ago wtf apple,negative
1,@apple Contact sync between Yosemite and iOS8 ...,apple contact sync yosemite ios seriously scre...,apple contact sync yosemite io seriously screw...,apple contact sync yosemite io seriously screw...,negative
2,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,warning buy iphone unlocked apple iphone use v...,warning buy iphone unlocked apple iphone use v...,warning buy iphone unlocked apple iphone use v...,negative
3,"@Apple, For the love of GAWD, CENTER the '1'on...",apple love gawd center damn calendar app fixed...,apple love gawd center damn calendar app fixed...,apple love gawd center damn calendar app fixed...,negative
4,i get the storage almost full notification lit...,get storage almost full notification literally...,get storage almost full notification literally...,get storage almost full notification literally...,negative


In [84]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3886 entries, 0 to 3885
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   old_text         3886 non-null   object
 1   text             3886 non-null   object
 2   text_lemmatized  3886 non-null   object
 3   text_separated   3886 non-null   object
 4   sentiment        3886 non-null   object
dtypes: object(5)
memory usage: 151.9+ KB


In [85]:
df = pd.DataFrame(columns=['word','Sentiment'])
df.word = clean_df.text_separated.apply(str)
df.Sentiment = clean_df.sentiment
df.head()

Unnamed: 0,word,Sentiment
0,wtf battery one second ago wtf apple,negative
1,apple contact sync yosemite io seriously screw...,negative
2,warning buy iphone unlocked apple iphone use v...,negative
3,apple love gawd center damn calendar app fixed...,negative
4,get storage almost full notification literally...,negative


In [86]:
word_series=pd.DataFrame()
label_encoder = LabelEncoder()
word_series['Word'] = df.word
word_series['sentiment'] = label_encoder.fit_transform(df['Sentiment'])
sentiments = word_series['sentiment'].unique()
word_series.sentiment.value_counts()

1    2162
0    1219
3     423
2      82
Name: sentiment, dtype: int64

Spliting the data

In [87]:
X = df.word
y = word_series.sentiment
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (3108,) (3108,)
Testing set shape: (778,) (778,)


In [89]:
# 1. Bag of words ( BOW )
count_vectorizer = CountVectorizer()
X_train_count_vectorizer = count_vectorizer.fit_transform(X_train)
X_test_count_vectorizer = count_vectorizer.transform(X_test)

# 2. Bag of words - TFIDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 3. N-Grams
ngram_range = (1, 3)
count_vectorizer = CountVectorizer( ngram_range=ngram_range)
X_train_ngram =count_vectorizer.fit_transform(X_train)
X_test_ngram = count_vectorizer.transform(X_test)

In [90]:
# Classifiers
def get_logisticRegression(X_train, X_test, y_train, y_test):
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    return classification_report(predictions, y_test)

def get_naiveBayes(X_train, X_test, y_train, y_test):
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    return (classification_report(predictions, y_test))

def get_randomForest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    return classification_report(predictions, y_test)

def get_svm(X_train, X_test, y_train, y_test):
    svm = SVC()
    svm.fit(X_train, y_train)
    predictions = svm.predict(X_test)
    return classification_report(predictions, y_test)

def get_perceptron(X_train, X_test, y_train, y_test):
    perceptron = Perceptron()
    perceptron.fit(X_train, y_train)
    predictions = perceptron.predict(X_test)
    return classification_report(predictions, y_test)

def classify(method,X_train,X_test,y_train,y_test):
    print("Logistic Regression for ",method,"\n",get_logisticRegression(X_train, X_test, y_train, y_test))
    print("Random Forest for ",method,"\n",get_randomForest(X_train, X_test, y_train, y_test))
    print("Naive Bayes for", method,"\n",get_naiveBayes(X_train, X_test, y_train, y_test))
    print("SVM for ",method,"\n",get_svm(X_train, X_test, y_train, y_test))
    print("Perceptron for ", method,"\n",get_perceptron(X_train, X_test, y_train, y_test))

In [32]:
classify("Bag of words",X_train_count_vectorizer,X_test_count_vectorizer,y_train,y_test)

Logistic Regression for  Bag of words 
               precision    recall  f1-score   support

           0       0.66      0.76      0.71       211
           1       0.89      0.73      0.80       523
           2       0.00      0.00      0.00         2
           3       0.25      0.50      0.34        42

    accuracy                           0.73       778
   macro avg       0.45      0.50      0.46       778
weighted avg       0.79      0.73      0.75       778

Random Forest for  Bag of words 
               precision    recall  f1-score   support

           0       0.60      0.78      0.68       190
           1       0.91      0.72      0.80       549
           2       0.00      0.00      0.00         1
           3       0.28      0.61      0.38        38

    accuracy                           0.72       778
   macro avg       0.45      0.53      0.47       778
weighted avg       0.80      0.72      0.75       778

Naive Bayes for Bag of words 
               precision  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
classify("Bag of words with TFIDF",X_train_tfidf,X_test_tfidf,y_train,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.62      0.71      0.67       214
           1       0.90      0.72      0.80       541
           2       0.00      0.00      0.00         0
           3       0.17      0.61      0.26        23

    accuracy                           0.71       778
   macro avg       0.42      0.51      0.43       778
weighted avg       0.80      0.71      0.74       778

Random Forest for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.62      0.77      0.69       200
           1       0.91      0.72      0.81       544
           2       0.00      0.00      0.00         1
           3       0.28      0.70      0.40        33

    accuracy                           0.73       778
   macro avg       0.45      0.55      0.47       778
weighted avg       0.81      0.73      0.76       778

Naive Bayes for Bag of words with 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.60      0.75      0.66       195
           1       0.92      0.70      0.80       565
           2       0.00      0.00      0.00         0
           3       0.14      0.67      0.24        18

    accuracy                           0.71       778
   macro avg       0.41      0.53      0.42       778
weighted avg       0.82      0.71      0.75       778

Perceptron for  Bag of words with TFIDF 
               precision    recall  f1-score   support

           0       0.64      0.75      0.69       207
           1       0.78      0.73      0.75       457
           2       0.22      0.29      0.25        14
           3       0.37      0.31      0.34       100

    accuracy                           0.68       778
   macro avg       0.50      0.52      0.51       778
weighted avg       0.68      0.68      0.67       778



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
classify("N-Grams",X_train_ngram,X_test_ngram,y_train,y_test)

Logistic Regression for  Ngrams 
               precision    recall  f1-score   support

           0       0.63      0.77      0.69       199
           1       0.90      0.71      0.79       547
           2       0.00      0.00      0.00         2
           3       0.20      0.57      0.30        30

    accuracy                           0.72       778
   macro avg       0.43      0.51      0.45       778
weighted avg       0.80      0.72      0.75       778

Random Forest for  Ngrams 
               precision    recall  f1-score   support

           0       0.55      0.82      0.66       163
           1       0.93      0.69      0.79       585
           2       0.00      0.00      0.00         2
           3       0.22      0.64      0.32        28

    accuracy                           0.71       778
   macro avg       0.42      0.54      0.44       778
weighted avg       0.82      0.71      0.75       778

Naive Bayes for Ngrams 
               precision    recall  f1-score

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
