<a href="https://colab.research.google.com/github/sureshmrd/Twitter-Sentiment-Analysis-NLP/blob/main/Twitter_Sentiment_Analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle



In [2]:
# Configure the Kaggle API credentials.
# Expects kaggle.json (the API token file) to be in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600 = read/write for the owner only, so other users cannot read the token.
!chmod 600 ~/.kaggle/kaggle.json

Importing Twitter Sentiment Analysis Dataset

In [3]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
# The archive is saved as sentiment140.zip in the current working directory.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 96% 78.0M/80.9M [00:00<00:00, 120MB/s]
100% 80.9M/80.9M [00:00<00:00, 94.7MB/s]


In [4]:
#extracting the compressed dataset
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The archive handle is named `archive`, not `zip`: binding it to `zip` would
# shadow the builtin zip() in every later cell of this kernel session.
with ZipFile(dataset, 'r') as archive:
  # With no arguments, extractall() unpacks into the current working directory.
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing the dependencies

In [7]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#from nltk.lemma import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Show NLTK's English stopword list; the stemming step drops these words from each tweet.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data processing

In [12]:
# Load the extracted CSV. Passing names= supplies the column labels, so the
# first line of the file is read as data rather than as a header.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='ISO-8859-1',
)

In [13]:
# Preview the first rows to check that the six column names line up with the data.
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
# (rows, columns) of the loaded frame.
twitter_data.shape

(1600000, 6)

In [15]:
# Column dtypes, non-null counts and memory footprint.
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [16]:
# Count missing values in each column.
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [18]:
# Class balance of the sentiment labels as loaded from the CSV.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


0 --> negative tweet
4 --> positive tweet

Changing the positive label from 4 to 1 for better understanding

In [19]:
# Relabel the positive class from 4 to 1 so the targets are binary 0/1.
twitter_data.loc[twitter_data['target'] == 4, 'target'] = 1

In [20]:
# Re-check the label distribution after the 4 -> 1 remapping.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0 --> negative tweet
1 --> positive tweet

In [21]:
#stemming

port_stem = PorterStemmer()

# Build the stopword lookup once. stopwords.words('english') rebuilds its list
# on every call, so calling it per token (as a list membership test) repeats
# that work and a linear scan for every word of every tweet. A frozenset gives
# constant-time membership checks instead.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter; compiled once for reuse.
NON_LETTERS = re.compile('[^a-zA-Z]')

def stemming(corpus_text):
  """Normalise one tweet into a space-separated string of stemmed tokens.

  Steps: replace every non-letter character with a space, lowercase the text,
  split on whitespace, drop English stopwords, Porter-stem the remaining words
  and join them back with single spaces.

  Args:
    corpus_text: the raw tweet text (a str).

  Returns:
    The cleaned, stemmed text as a single str (empty if no tokens survive).
  """
  tokens = NON_LETTERS.sub(' ', corpus_text).lower().split()
  return ' '.join(
      port_stem.stem(word) for word in tokens if word not in ENGLISH_STOPWORDS
  )


In [22]:
# Run stemming() once per tweet and store the cleaned text in a new column.
twitter_data['stemmed_text']=twitter_data['text'].apply(stemming)

In [23]:
# Show the stemmed tweets followed by their labels (pandas elides the middle rows).
print(twitter_data['stemmed_text'],twitter_data['target'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_text, Length: 1600000, dtype: object 0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [24]:
# Pull the features (stemmed text) and the labels out of the frame as arrays.
X=twitter_data['stemmed_text'].values
y=twitter_data['target'].values

In [25]:
# Quick look at the feature and label arrays (the middle of each array is elided).
print(X,y)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h'] [0 0 0 ... 1 1 1]


In [27]:
# Hold out 20% of the tweets for testing. stratify=y splits in a stratified
# fashion using the labels, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
    stratify=y,
)

In [28]:
# Sizes of the full feature array and of its train/test splits.
print(X.shape,x_train.shape,x_test.shape)

(1600000,) (1280000,) (320000,)


In [29]:
# Sizes of the full label array and of its train/test splits.
print(y.shape,y_train.shape,y_test.shape)

(1600000,) (1280000,) (320000,)


In [30]:
# Peek at the stemmed tweets in the training split.
print(x_train)

['realli half minut ago sinc updat im xd aaah love shame cnt go letter tho'
 'tad depress realiti best week ever realli sink' 'joannahelm hear tgif'
 ... 'still hive might life waaa realli autoimmun' 'get readi'
 'sleepytim now wanna wait dad come home though mum drive crazi usual nightnight x']


In [31]:
# Peek at the stemmed tweets in the test split.
print(x_test)

['jordanknight bozo clown' 'ricopremier took tylenol make drowsi'
 'sigh guess start pay' ...
 'twitbotnew avi kay would like appli assimov law robot twitter bot talk feel http bit ly jebuk'
 'need bigger ipod mb left delet mb worth make room babi darl doll face honey'
 'wine festiv today bore come drink draw merri']


In [32]:
# Peek at the training labels.
print(y_train)

[0 0 1 ... 0 1 0]


In [33]:
# Peek at the test labels.
print(y_test)

[1 0 0 ... 1 0 1]


In [34]:
# Turn the stemmed text into TF-IDF feature matrices.
# Note the naming: lowercase x_train/x_test hold text, uppercase X_train/X_test hold the TF-IDF matrices.
vectorizer = TfidfVectorizer()
# Fit on the training text only, so nothing from the test split shapes the features.
X_train=vectorizer.fit_transform(x_train)
# The test text is only transformed, reusing what was fitted on the training split.
X_test=vectorizer.transform(x_test)

In [35]:
# Print the TF-IDF training matrix; each output line is (row, feature index) followed by the weight.
print(X_train)

  (0, 407122)	0.2449378566345004
  (0, 233063)	0.32899012562742846
  (0, 150078)	0.13596830703457027
  (0, 77292)	0.34872708210000086
  (0, 361978)	0.28708270745925357
  (0, 242291)	0.15564710957377614
  (0, 363)	0.3713917824436101
  (0, 448836)	0.27098307408623185
  (0, 178191)	0.17416151253892917
  (0, 426518)	0.2397690303816763
  (0, 369099)	0.23778120529817967
  (0, 6242)	0.2593366351316874
  (0, 267342)	0.253746288012446
  (0, 158389)	0.2586505758591688
  (0, 335197)	0.17539295254819243
  (1, 369392)	0.46684901713605176
  (1, 124750)	0.28490203501945915
  (1, 438496)	0.24862635890744317
  (1, 39596)	0.27427987547601074
  (1, 335146)	0.41058423560799007
  (1, 98223)	0.35748595116888926
  (1, 393814)	0.4672392358157062
  (1, 335197)	0.22169841205474725
  (2, 400456)	0.5169109837538088
  (2, 162250)	0.337186327683649
  :	:
  (1279996, 453701)	0.2766279169821965
  (1279996, 446341)	0.25784921506074365
  (1279996, 375869)	0.2995561917098937
  (1279997, 28220)	0.5903579970418673
  (1279

In [36]:
# Print the TF-IDF test matrix; each output line is (row, feature index) followed by the weight.
print(X_test)

  (0, 202636)	0.43812621133556234
  (0, 76620)	0.5292805525416694
  (0, 49732)	0.7265724462446476
  (1, 422342)	0.589130338879642
  (1, 413338)	0.3907540953315867
  (1, 249396)	0.2725491608112262
  (1, 109866)	0.6526512359095383
  (2, 382262)	0.4236127291210829
  (2, 367853)	0.5596958576057355
  (2, 311123)	0.5357663703607016
  (2, 155744)	0.4693050171222206
  (3, 445626)	0.3345637040340407
  (3, 438948)	0.1873983153969061
  (3, 435623)	0.19873137414154904
  (3, 358703)	0.1802954672017031
  (3, 337621)	0.2855099603443114
  (3, 275099)	0.5434217568369408
  (3, 169708)	0.18539253779459292
  (3, 168865)	0.1897781207255086
  (3, 150078)	0.14176162309909768
  (3, 120758)	0.26148906614865175
  (3, 74461)	0.28255440351217553
  (3, 60521)	0.22304891131105312
  (3, 39686)	0.3401152981740469
  (4, 445689)	0.257082503746696
  :	:
  (319997, 43111)	0.1749267591001244
  (319997, 28625)	0.3658375538604147
  (319997, 21511)	0.2879505858634176
  (319998, 446248)	0.22917000584333636
  (319998, 345411)	

In [38]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# max_iter=1000 caps the solver at 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

In [39]:
# Score the model on the data it was fitted on; compare with the test score to gauge overfitting.
X_train_prediction=model.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
training_data_accuracy=accuracy_score(y_train,X_train_prediction)
print("Accuracy score on Training Data is: ",training_data_accuracy)

Accuracy score on Training Data is:  0.8100171875


In [40]:
# Score the model on the held-out test split.
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
testing_data_accuracy=accuracy_score(y_test,X_test_prediction)
print("Accuracy score on Testing Data is: ",testing_data_accuracy)

Accuracy score on Testing Data is:  0.779396875


In [None]:
# NOTE(review): this cell has no execution count, and no other cell shown in this
# notebook uses `lemmatizer` (preprocessing relies on PorterStemmer instead).
# Presumably a leftover experiment; confirm before relying on it.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [41]:
import pickle

# Persist the trained classifier to disk.
filename = 'twitter_sentiment_analysis_model.sav'
# The `with` block closes the file as soon as the dump finishes, so the data is
# flushed to disk and no file handle is left open in the kernel.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

In [42]:
#using the saved model

# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code embedded in the file. The `with` block closes the handle after loading.
with open(filename, 'rb') as model_file:
  loaded_model = pickle.load(model_file)

# Take one row of the test matrix and print its true label for comparison.
x_new = X_test[100]
print(y_test[100])

# Predict with the reloaded model; the output should match the label printed above.
prediction = loaded_model.predict(x_new)
print(prediction)



1
[1]
