<a href="https://colab.research.google.com/github/valievav/ML-projects/blob/main/Twitter_sentiment_analysis_using_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
# install kaggle to use data via Kaggle API
! pip install kaggle




In [32]:
# store Kaggle API access key under ~/.kaggle/
# NOTE: kaggle.json is copied from the current working directory, so it has to be
# uploaded there before this cell runs; otherwise cp and chmod fail with
# "No such file or directory"
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# 600 = read/write for the owner only, so the key is not readable by other users
!chmod 600 ~/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [33]:
# call API to fetch dataset from Kaggle (faster than a manual download)
# the archive is saved as sentiment140.zip; a more recent local copy is not
# re-downloaded unless --force is passed
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [34]:
# extract compressed dataset
from zipfile import ZipFile

zipped_dataset = '/content/sentiment140.zip'

# the handle is deliberately not named `zip`: a notebook-global with that name
# would shadow the builtin zip() for every later cell
with ZipFile(zipped_dataset, 'r') as archive:
  archive.extractall()  # no target path given -> extracts into the current working directory
  print('The dataset is extracted')



The dataset is extracted


In [35]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords  # natural language toolkit
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
# get stopwords package: download the NLTK 'stopwords' corpus that
# stopwords.words('english') reads in the following cells
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
# see stopwords - words that add NO contextual importance, so they can be excluded for the ML model
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [38]:
# read the extracted Sentiment140 CSV into a pandas dataframe
# the first row of the file is already a tweet, so column names are supplied via `names`
data_path = '/content/training.1600000.processed.noemoticon.csv'
header_names = [
    'target',
    'ids',
    'date',
    'flag',
    'user',
    'text',
]
df = pd.read_csv(
    data_path,
    encoding='iso-8859-1',
    names=header_names,
)

In [39]:
# check data: show the first rows to verify the columns were parsed as expected
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [40]:
# check for missing values: number of nulls per column
# (all zeros -> nothing to drop or impute)
df.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [41]:
# check distribution of data
# if distribution is not even we'll need to do upsampling or downsampling

df['target'].value_counts()  # distribution is even between 0 and 4 (800000 rows each)

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [42]:
# Stemming - process of reducing words into root words

port_stem = PorterStemmer()

# Build the stopword lookup ONCE. The original called stopwords.words('english')
# for every single word, rebuilding the list and scanning it linearly each time;
# that is what made this cell take ~50 min on 1.6M tweets. A frozenset gives
# constant-time membership tests and is created a single time.
english_stopwords = frozenset(stopwords.words('english'))

# anything that is not an ASCII letter (digits, punctuation, '@', ...) becomes a space
non_letters = re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise one tweet into a space-separated string of stemmed tokens.

  Steps: replace every non-letter character with a space, lower-case, split on
  whitespace, drop English stopwords, stem the remaining words with PorterStemmer.

  Args:
    content: raw tweet text (str).

  Returns:
    str with the stemmed, stopword-free tokens joined by single spaces
    (empty string when no token is left).
  """
  words = non_letters.sub(' ', content).lower().split()
  stemmed_context = [port_stem.stem(word) for word in words if word not in english_stopwords]
  return ' '.join(stemmed_context)

df['stemmed_text'] = df['text'].apply(stemming)

In [44]:
# data with new column: stemmed_text holds the cleaned, stemmed version of text
df.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [45]:
# separate the features (stemmed tweets) from the labels (sentiment target)
X, Y = df['stemmed_text'].values, df['target'].values


In [46]:
# peek at the stemmed tweets (only the first and last entries of the long array are printed)
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [47]:
# peek at the labels: 0 and 4 are the only two values of the target column
print(Y)

[0 0 0 ... 4 4 4]


In [48]:
# split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    # fixed seed so the split is reproducible on every run
    random_state=2,
    # keep the same proportion of 0 and 4 labels in both train and test data
    stratify=Y,
    # 20% of the data goes to the test set, the remaining 80% to the train set
    test_size=0.2,
)


In [49]:
# check if split done 80%-20% as per setup: shapes of the full, train and test arrays
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [50]:
# convert text into numerical values, since models do not read textual data
vectorizer = TfidfVectorizer()  # TF-IDF: weighs a word by how often it occurs in a tweet, scaled down for words that are common across tweets

X_train = vectorizer.fit_transform(X_train)  # fit - learns the vocabulary and IDF weights from the train data only; transform - to numerical values
X_test = vectorizer.transform(X_test)  # not using fit - test data is encoded with what was learned on train, so it stays unseen; transform - to numerical values
# NOTE: X_train / X_test are rebound from text arrays to TF-IDF matrices here, so re-running
# this cell alone would feed matrices instead of text to the vectorizer - re-run the split cell first


In [51]:
# inspect the train TF-IDF matrix: each printed line is (row index, column index) followed by the weight
print(X_train)

  (0, 436713)	0.27259876264838384
  (0, 354543)	0.3588091611460021
  (0, 185193)	0.5277679060576009
  (0, 109306)	0.3753708587402299
  (0, 235045)	0.41996827700291095
  (0, 443066)	0.4484755317023172
  (1, 160636)	1.0
  (2, 109306)	0.4591176413728317
  (2, 124484)	0.1892155960801415
  (2, 407301)	0.18709338684973031
  (2, 129411)	0.29074192727957143
  (2, 406399)	0.32105459490875526
  (2, 433560)	0.3296595898028565
  (2, 77929)	0.31284080750346344
  (2, 443430)	0.3348599670252845
  (2, 266729)	0.24123230668976975
  (2, 409143)	0.15169282335109835
  (2, 178061)	0.1619010109445149
  (2, 150715)	0.18803850583207948
  (2, 132311)	0.2028971570399794
  (2, 288470)	0.16786949597862733
  (3, 406399)	0.29029991238662284
  (3, 158711)	0.4456939372299574
  (3, 151770)	0.278559647704793
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 318303)	0.21254698865277744
  (1279996, 434014)	0.27189450523324465
  (1279996, 390130)	0.2206474219107611
  (1279996, 373144)	0.35212500999832036
  (1279996, 23807

In [52]:
# inspect the test TF-IDF matrix: each printed line is (row index, column index) followed by the weight
print(X_test)

  (0, 15110)	0.1719352837797837
  (0, 31168)	0.1624772418052177
  (0, 67828)	0.26800375270827315
  (0, 106069)	0.36555450010904555
  (0, 132364)	0.255254889555786
  (0, 138164)	0.23688292264071406
  (0, 171378)	0.2805816206356074
  (0, 271016)	0.45356623916588285
  (0, 279082)	0.17825180109103442
  (0, 388348)	0.2198507607206174
  (0, 398906)	0.34910438732642673
  (0, 409143)	0.3143047059807971
  (0, 420984)	0.17915624523539805
  (1, 6463)	0.30733520460524466
  (1, 15110)	0.211037449588008
  (1, 145393)	0.575262969264869
  (1, 217562)	0.40288153995289894
  (1, 256777)	0.28751585696559306
  (1, 348135)	0.4739279595416274
  (1, 366203)	0.24595562404108307
  (2, 22532)	0.3532582957477176
  (2, 34401)	0.37916255084357414
  (2, 89448)	0.36340369428387626
  (2, 183312)	0.5892069252021465
  (2, 256834)	0.2564939661498776
  :	:
  (319994, 443794)	0.2782185641032538
  (319995, 107868)	0.33399349737546963
  (319995, 109379)	0.3020896484890833
  (319995, 155493)	0.2770682832971669
  (319995, 2133

In [53]:
# train ML model on the TF-IDF features of the train split
model = LogisticRegression(max_iter=1000)  # classify datapoints into different classes (positive-negative tweets); max_iter raises the solver iteration limit to 1000
model.fit(X_train, Y_train)


In [54]:
# model evaluation - get accuracy score on the train data
# (data the model was fitted on, so this is the optimistic baseline for the test score)
Y_train_prediction = model.predict(X_train)
train_data_accuracy = accuracy_score(Y_train, Y_train_prediction)
print(f'Accuracy score on the TRAIN data: {train_data_accuracy}')


Accuracy score on the TRAIN data: 0.79871953125


In [55]:
# model evaluation - get accuracy score on the test data
# (held-out 20% the model has not seen during fitting)
Y_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, Y_test_prediction)
print(f'Accuracy score on the TEST data: {test_data_accuracy}')

# the test accuracy is reported as the overall model accuracy
print(f'Model accuracy: {test_data_accuracy}')

Accuracy score on the TEST data: 0.77668125
Model accuracy: 0.77668125


In [58]:
# save trained model
import pickle

filename = 'trained_model_result.sav'
# `with` closes the file handle deterministically, so the pickle is completely
# written to disk before any later cell reads it back
with open(filename, 'wb') as model_file:  # write in binary format
  pickle.dump(model, model_file)
# NOTE: only the classifier is saved; predicting on new raw text also needs the
# fitted `vectorizer` and the same `stemming` preprocessing used for training


In [59]:
# use saved model for future predictions (no need to rerun the training steps)
# SECURITY: pickle.load can execute arbitrary code - only load files produced by this notebook
saved_file = f'/content/{filename}'  # assumes the model was saved with /content as working directory - TODO confirm outside Colab
# `with` closes the file handle as soon as the model is deserialised
with open(saved_file, 'rb') as model_file:  # read in binary format
  loaded_model = pickle.load(model_file)


In [73]:
# sanity check: the unpickled model must reproduce the original model's predictions on the test set
Y_test_pred_orig = model.predict(X_test)
Y_test_pred_loaded = loaded_model.predict(X_test)

if np.array_equal(Y_test_pred_orig, Y_test_pred_loaded):
  print('Predictions between original and loaded models are the same')
else:
  print(f'WARNING!!! Detected differences in predictions original model vs loaded')


Predictions between original and loaded models are the same
