In [None]:
!pip install unidecode contractions --quiet
!pip install emot --upgrade --quiet
!pip install vaderSentiment --upgrade --quiet
!pip install textblob --upgrade --quiet

[K     |████████████████████████████████| 235 kB 5.2 MB/s 
[K     |████████████████████████████████| 321 kB 51.5 MB/s 
[K     |████████████████████████████████| 284 kB 66.8 MB/s 
[?25h  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 61 kB 19 kB/s 
[K     |████████████████████████████████| 125 kB 5.2 MB/s 
[K     |████████████████████████████████| 636 kB 4.8 MB/s 
[?25h

In [None]:
import os
import re
import json
import string
import math
from tqdm import tqdm
import emot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk
import spacy
import unidecode 
from bs4 import BeautifulSoup
import contractions
import pickle
from emot.emo_unicode import EMOTICONS_EMO 
from emot.emo_unicode import EMOJI_UNICODE,UNICODE_EMOJI
from sklearn.metrics import f1_score,accuracy_score

# Fetch the NLTK resources this notebook relies on, in a fixed order.
for corpus_name in ('stopwords', 'wordnet', 'movie_reviews', 'punkt'):
  nltk.download(corpus_name)

def lower_case(text) :
  """Return `text` converted to lower case."""
  lowered = text.lower()
  return lowered

def strip_html_tags(text) :
  """Parse `text` as HTML and return only its text content.

  Text fragments are joined with a single space as the separator.
  """
  return BeautifulSoup(text,"html.parser").get_text(separator=" ")
 
 
def accented_chars_to_ascii(text) :
  """Return `text` after passing it through `unidecode.unidecode`."""
  return unidecode.unidecode(text)
  
  
def expand_contractions(text) :
  """Expand shortened words (e.g. `don't` to `do not`) using `contractions.fix`."""
  return contractions.fix(text)
  
  
def remove_urls(text) :
  """Delete http(s):// links and www.-prefixed links from `text`.

  A link runs up to the next whitespace character; the surrounding
  whitespace is left untouched.
  """
  return re.sub(r'https?:\/\/\S+|www\.\S+', r'', text)
  
  
def remove_twitter_handles(text) :
  """Delete every `@` together with the non-whitespace run that follows it."""
  return re.sub(r'@[^\s]+', r'', text)
  
  
def convert_emoticons(text) :
  """Replace each emoticon listed in EMOTICONS_EMO with its description.

  The description has its commas removed and its words joined by
  underscores, and is inserted with one space on either side.
  """
  for emoticon, meaning in EMOTICONS_EMO.items():
    label = "_".join(meaning.replace(",","").split())
    text = re.sub(u'('+re.escape(emoticon)+')', " " + label + " ", text)
  return text
  
  
def convert_emojis(text):
  """Replace each emoji listed in UNICODE_EMOJI with its textual name.

  Commas and colons are stripped from the name and its words are joined
  by underscores; no padding is added around the replacement.
  """
  for emoji_char, emoji_name in UNICODE_EMOJI.items():
    label = "_".join(emoji_name.replace(',','').replace(":","").split())
    text = re.sub(r'('+re.escape(emoji_char)+')', label, text)
  return text
  
  
def remove_digts(text) :
  """Replace every run of word characters that contains a digit with one space."""
  digit_token = r'\w*\d\w*'
  return re.sub(digit_token,' ',text)
  
  
def remove_punctuations(text) :
  """Delete every character listed in `string.punctuation` from `text`."""
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletion_table)


from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  """Drop whitespace-separated tokens that appear in `stop_words`.

  `text` is coerced with `str()` first; the surviving tokens are joined
  back together with single spaces.
  """
  kept_tokens = []
  for token in str(text).split():
    if token not in stop_words:
      kept_tokens.append(token)
  return " ".join(kept_tokens)
  
  
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def stem_words(text) :
  """Apply the Porter stemmer to each whitespace-separated token of `text`."""
  stemmed_tokens = map(stemmer.stem, text.split())
  return " ".join(stemmed_tokens)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text) :
  """Apply the WordNet lemmatizer to each whitespace-separated token of `text`."""
  lemmas = map(lemmatizer.lemmatize, text.split())
  return " ".join(lemmas)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def testing_model(model,vectorizer,y_train) :
  """Print F1 and accuracy of `model` on already-vectorized features.

  Despite its name, `vectorizer` is passed straight to `model.predict`,
  so it must be a feature matrix; `y_train` holds the labels the
  predictions are scored against.
  """
  pred = model.predict(vectorizer)
  f1 = f1_score(y_train,pred)
  acc = accuracy_score(y_train,pred)
  print(f"F1 Score : {f1}", f"Acc. : {acc}", sep="\n")

def cleaning_pipeline(x) :
  """Run the text-cleaning steps over a pandas Series of tweets.

  Missing values are replaced by empty strings, then each step is applied
  element-wise in the listed order. Returns the cleaned Series.

  NOTE(review): expand_contractions runs after punctuation and stop-word
  removal, so it only sees apostrophe-less tokens and any words it
  introduces are not stop-word filtered - confirm this ordering is
  intentional before changing it.
  """
  steps = (
    lower_case,
    strip_html_tags,
    accented_chars_to_ascii,
    remove_urls,
    remove_twitter_handles,
    # convert_emoticons,  (disabled)
    # convert_emojis,     (disabled)
    remove_digts,
    remove_punctuations,
    remove_stopwords,
    expand_contractions,
    # stem_words,         (disabled in favour of lemmatization)
    lemmatize_words,
  )
  cleaned = x.fillna("")
  for step in steps:
    cleaned = cleaned.apply(step)
  return cleaned

def testing_model(model,vectorizer,y_train) :
  """Predict with `model` on vectorized features and print F1 / accuracy.

  `vectorizer` is the feature matrix handed to `model.predict` (not a
  vectorizer object), and `y_train` holds the labels to score against.
  The metric printing is delegated to `testing_metrics` so the report
  format lives in one place.

  NOTE(review): an earlier definition with the same name exists in this
  cell; this later one is the definition in effect. One of the two should
  be deleted.
  """
  testing_metrics(model.predict(vectorizer),y_train)

def testing_metrics(y_pred,y_test):
  """Print the F1 score and accuracy of predictions `y_pred` against labels `y_test`."""
  f1 = f1_score(y_test,y_pred)
  acc = accuracy_score(y_test,y_pred)
  print(f"F1 Score : {f1}", f"Acc. : {acc}", sep="\n")

def testing_pipeline(x,y,model,vectorizer) :
  """Transform text `x` with a fitted `vectorizer` and print `model` metrics against `y`."""
  testing_model(model,vectorizer.transform(x),y)

In [None]:
path = "/content/drive/MyDrive/DataSet/Sentiment140/sentiment140.csv"

# NOTE(review): read_csv treats the first row of the file as the header before
# the columns are renamed below; if the CSV has no header row, that first
# record is silently dropped. Left as-is because changing the row count would
# change the train/test split used further down - confirm against the file.
df = pd.read_csv(path,encoding='latin')
df.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']

# Keep only label and text, drop exact duplicate rows, and map the label to
# binary (4 -> 1, anything else -> 0). Built as a non-mutating chain so that
# re-running the cell never modifies a derived frame in place.
df = (
  df[['sentiment','text']]
  .drop_duplicates()
  .assign(sentiment=lambda frame: frame['sentiment'].eq(4).astype('int64'))
)
df.head()

Unnamed: 0,sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [None]:
from sklearn.model_selection import train_test_split

# Features are the tweet text, labels the sentiment column; 20% is held out
# for testing with a fixed seed.
X = df['text']
y = df['sentiment']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Cleaned copies of both splits; the raw X_train / X_test are kept unchanged.
X_train1 = cleaning_pipeline(X_train)
X_test1 = cleaning_pipeline(X_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# All four vectorizers are fitted on the cleaned training text only.
# `bow` / `tfidf` ignore terms found in fewer than 5 documents (min_df=5);
# `bow1` / `tfidf1` use the default settings.
bow = CountVectorizer(min_df=5)
bow.fit(X_train1)
bow1 = CountVectorizer()
bow1.fit(X_train1)

tfidf = TfidfVectorizer(min_df=5)
tfidf.fit(X_train1)
tfidf1 = TfidfVectorizer()
tfidf1.fit(X_train1)

In [None]:
import pickle

# Persist the fitted vectorizers. Each file is opened in a `with` block so the
# handle is flushed and closed before the next cell tries to read the file.
for pickle_name, fitted_vectorizer in (
  ('Bow_5.pkl', bow),
  ('Bow.pkl', bow1),
  ('Tfidf_5.pkl', tfidf),
  ('Tfidf.pkl', tfidf1),
):
  with open(pickle_name,'wb') as out_file:
    pickle.dump(fitted_vectorizer,out_file)

In [None]:
from google.colab import files

# The names must match the files written by the pickle.dump cell, which uses
# 'Bow_5.pkl' and 'Tfidf_5.pkl' (with an underscore) for the min_df=5 variants.
for artifact in ('Bow.pkl', 'Bow_5.pkl', 'Tfidf.pkl', 'Tfidf_5.pkl'):
  files.download('/content/' + artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bag-of-words features (min_df=5 vocabulary).
x_train_bow = bow.transform(X_train1)
x_test_bow = bow.transform(X_test1)

# Bag-of-words features (default vocabulary).
x_train_bow1 = bow1.transform(X_train1)
x_test_bow1 = bow1.transform(X_test1)

# TF-IDF features (min_df=5 vocabulary).
x_train_tfidf = tfidf.transform(X_train1)
x_test_tfidf = tfidf.transform(X_test1)

# TF-IDF features (default vocabulary).
x_train_tfidf1 = tfidf1.transform(X_train1)
x_test_tfidf1 = tfidf1.transform(X_test1)

# TextBlob

In [None]:
# Show the first tweet of the raw (uncleaned) test split.
X_test.iloc[0]

'redoinq the myspace '

In [None]:
from textblob import TextBlob

# Polarity lies in [-1, 1]; halving and adding 0.5 rescales it to [0, 1]
# before rounding to a 0/1 label.
pred = np.array([
  np.round(TextBlob(tweet).sentiment.polarity/2+0.5)
  for tweet in tqdm(X_test)
], dtype=float)

print("\nTesting TextBlob without Cleaning Data")
testing_metrics(pred,y_test)

100%|██████████| 316738/316738 [01:42<00:00, 3085.81it/s]



Testing TextBlob without cleaning
F1 Score : 0.5994065480655877
Acc. : 0.6223566480813796


In [None]:
from textblob import TextBlob

# Same scoring as the raw-text run, but on the cleaned test tweets.
pred = np.array([
  np.round(TextBlob(tweet).sentiment.polarity/2+0.5)
  for tweet in tqdm(X_test1)
], dtype=float)

print("\nTesting TextBlob with cleaned Data")
testing_metrics(pred,y_test)

100%|██████████| 316738/316738 [01:17<00:00, 4063.08it/s]



Testing TextBlob with cleaned Data
F1 Score : 0.5914649431244623
Acc. : 0.6192373507441482


# Vader

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

# The 'compound' score is halved and shifted by 0.5, then rounded to a 0/1 label.
pred = np.array([
  np.round(vader.polarity_scores(tweet)['compound']/2+0.5)
  for tweet in tqdm(X_test)
], dtype=float)

print("\nTesting Vader without Cleaning Data")
testing_metrics(pred,y_test)

100%|██████████| 316738/316738 [00:45<00:00, 6954.33it/s]



Testing Vader without Cleaning Data
F1 Score : 0.6453152845957371
Acc. : 0.6518952572788866


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

# Same scoring as the raw-text run, but on the cleaned test tweets.
pred = np.array([
  np.round(vader.polarity_scores(tweet)['compound']/2+0.5)
  for tweet in tqdm(X_test1)
], dtype=float)

print("\nTesting Vader with Cleaned Data")
testing_metrics(pred,y_test)

100%|██████████| 316738/316738 [00:32<00:00, 9722.98it/s]



Testing Vader with Cleaned Data
F1 Score : 0.6352670078847329
Acc. : 0.6362008979029987


# Logistic Regression

In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Logistic Regression
#   - Lemmatization
#   - Bag of Words
#   - Min_df = 0 (features come from `bow1`, the vectorizer fitted without min_df)
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/Logistic_Regression_bow_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_bow1,y_train)
print("Testing : ")
testing_model(model,x_test_bow1,y_test)

Training :
F1 Score : 0.800756190248842
Acc. : 0.7957767934381097
Testing : 
F1 Score : 0.7858920628312001
Acc. : 0.7791897404163693


In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Logistic Regression
#   - Lemmatization
#   - TF-IDF
#   - Min_df = 0 (features come from `tfidf1`, the vectorizer fitted without min_df)
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/Logistic_Regression_tfidf_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_tfidf1,y_train)
print("Testing : ")
testing_model(model,x_test_tfidf1,y_test)

Training :
F1 Score : 0.8015236925348527
Acc. : 0.7979915576912148
Testing : 
F1 Score : 0.7856056202149633
Acc. : 0.7810935220908132


# Naive Bayes

## Bernoulli

In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Bernoulli Naive Bayes
#   - Lemmatization
#   - Bag of Words
#   - Min_df = 0 (features come from `bow1`, the vectorizer fitted without min_df)
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/Bernoulli_NB_bow_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_bow1,y_train)
print("Testing : ")
testing_model(model,x_test_bow1,y_test)

Training :
F1 Score : 0.8051521595475144
Acc. : 0.8035742474853033
Testing : 
F1 Score : 0.7733481723732724
Acc. : 0.7698160624869766


In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Bernoulli Naive Bayes
#   - Lemmatization
#   - TF-IDF
#   - Min_df = 0 (features come from `tfidf1`, the vectorizer fitted without min_df)
#
# NOTE(review): the recorded scores for this cell are identical to the
# Bag-of-Words Bernoulli cell; presumably the model binarizes its inputs, so
# both feature sets look the same to it - confirm.
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/Bernoulli_NB_tfidf_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_tfidf1,y_train)
print("Testing : ")
testing_model(model,x_test_tfidf1,y_test)

Training :
F1 Score : 0.8051521595475144
Acc. : 0.8035742474853033
Testing : 
F1 Score : 0.7733481723732724
Acc. : 0.7698160624869766


## Multinomial

In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Multinomial Naive Bayes
#   - Lemmatization
#   - Bag of Words
#   - Min_df = 0 (features come from `bow1`, the vectorizer fitted without min_df)
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/MultinomialNB_bow_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_bow1,y_train)
print("Testing : ")
testing_model(model,x_test_bow1,y_test)

Training :
F1 Score : 0.8019615071343096
Acc. : 0.8036776452462288
Testing : 
F1 Score : 0.7679696471737737
Acc. : 0.7690772815386849


In [None]:
# Configuration evaluated in this cell (as recorded by the author):
#   - Multinomial Naive Bayes
#   - Lemmatization
#   - TF-IDF
#   - Min_df = 0 (features come from `tfidf1`, the vectorizer fitted without min_df)
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/MultinomialNB_tfidf_lem.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_tfidf1,y_train)
print("Testing : ")
testing_model(model,x_test_tfidf1,y_test)

Training :
F1 Score : 0.8030909260426536
Acc. : 0.8049641975386597
Testing : 
F1 Score : 0.7617398487640592
Acc. : 0.7632428063573048


# Gradient Boosting Classifier

In [None]:
# Configuration evaluated in this cell (as recorded by the author; the
# hyperparameters cannot be verified from this cell alone):
#   - Gradient Boosting
#   - Lemmatization
#   - TF-IDF
#   - Min_df = 5 (features come from `tfidf`, fitted with min_df=5)
#   - lr = 1.5
#   - n = 150
#   - depth = 10
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
with open('/content/GBC_Classifier_tfidf_lem2.pkl','rb') as model_file:
  model = pickle.load(model_file)

print("Training :")
testing_model(model,x_train_tfidf,y_train)
print("Testing : ")
testing_model(model,x_test_tfidf,y_test)

Training :
F1 Score : 0.8065127112261151
Acc. : 0.8002773585739633
Testing : 
F1 Score : 0.7779714526018113
Acc. : 0.7699676072968826
