In [1]:
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


**Downloading Kaggle Dataset**

In [2]:
!kaggle datasets download -d kazanova/sentiment140 --unzip


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 96% 78.0M/80.9M [00:03<00:00, 33.0MB/s]
100% 80.9M/80.9M [00:03<00:00, 23.1MB/s]


**Importing Dependencies**

In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
#printing stopwords in the english
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

**Data Processing**

In [7]:
#Load the dataset csv file
Twitter_data=pd.read_csv('/content/twitterdataset.csv',encoding='latin-1')

In [8]:
#checking number of columns and rows
Twitter_data.shape

(1599999, 6)

In [9]:
#printing first five rows
Twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
#set the column names
column_names=['target','id','date','flag','user','text']
Twitter_data=pd.read_csv('/content/twitterdataset.csv',names=column_names,encoding='latin-1')


In [11]:
#check columns and rows
Twitter_data.shape

(1600000, 6)

In [12]:
#printing first Few rows
Twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
#counnting the number of missing values
Twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [14]:
#checking the distribution of columns
Twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


**converting  the target 4 to 1**

In [15]:
Twitter_data.replace({'target':{4:1}},inplace=True)

In [16]:
#checking the distribution of columns
Twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0---->Negative tweet

1---->Positive tweet


**Stemming**

stemming is the process of reducing a word to key or root word

example: actor,actress,acting=act


In [17]:
port_stem=PorterStemmer()

In [18]:
def stemming(content):
  stemmed_content=re.sub('[^a-zA-z]',',',content)
  stemmed_content=stemmed_content.lower()
  stemmed_content=stemmed_content.split()
  stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content=''.join(stemmed_content)
  return stemmed_content

In [19]:
Twitter_data['stemmed_content']=Twitter_data['text'].apply(stemming)
#15 minutes to complete execution

In [20]:
Twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",",switchfoot,http,,,twitpic,com,,y,zl,,,awww,,t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"is,upset,that,he,can,t,update,his,facebook,by,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,",kenichan,i,dived,many,times,for,the,ball,,man..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"my,whole,body,feels,itchy,and,like,its,on,fire,"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",",nationwideclass,no,,it,s,not,behaving,at,all,..."


In [21]:
print(Twitter_data['stemmed_content'])

0          ,switchfoot,http,,,twitpic,com,,y,zl,,,awww,,t...
1          is,upset,that,he,can,t,update,his,facebook,by,...
2          ,kenichan,i,dived,many,times,for,the,ball,,man...
3            my,whole,body,feels,itchy,and,like,its,on,fire,
4          ,nationwideclass,no,,it,s,not,behaving,at,all,...
                                 ...                        
1599995    just,woke,up,,having,no,school,is,the,best,fee...
1599996    thewdb,com,,,very,cool,to,hear,old,walt,interv...
1599997    are,you,ready,for,your,mojo,makeover,,ask,me,f...
1599998    happy,,,th,birthday,to,my,boo,of,alll,time,,,,...
1599999    happy,,charitytuesday,,thenspcc,,sparkscharity...
Name: stemmed_content, Length: 1600000, dtype: object


In [22]:
print(Twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [23]:
#separating Data and labels
X=Twitter_data['stemmed_content'].values
Y=Twitter_data['target'].values

In [24]:
print(X)

[',switchfoot,http,,,twitpic,com,,y,zl,,,awww,,that,s,a,bummer,,,you,shoulda,got,david,carr,of,third,day,to,do,it,,,d'
 'is,upset,that,he,can,t,update,his,facebook,by,texting,it,,,,and,might,cry,as,a,result,,school,today,also,,blah,'
 ',kenichan,i,dived,many,times,for,the,ball,,managed,to,save,,,,,,the,rest,go,out,of,bound'
 ... 'are,you,ready,for,your,mojo,makeover,,ask,me,for,details,'
 'happy,,,th,birthday,to,my,boo,of,alll,time,,,,tupac,amaru,shakur,'
 'happy,,charitytuesday,,thenspcc,,sparkscharity,,speakinguph,h,']


In [25]:
print(Y)

[0 0 0 ... 1 1 1]


**Splitting dataset into Training and Splitting**

In [26]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [27]:
print(X.shape,X_train.shape,X_test.shape)

(1600000,) (1280000,) (320000,)


In [28]:
print(X_train)

['about,to,watch,saw,iv,and,drink,a,lil,wine,' ',hatermagazine,i,m,in,,'
 'even,though,its,my,favourite,drink,i,think,its,the,vodka,and,coke,that,wipes,my,mind,all,the,time,,think,im,gonna,have,to,find,a,new,drink'
 ... 'is,eager,for,monday,afternoon,'
 'hope,everyone,and,their,mother,had,a,great,day,,,can,t,wait,to,hear,what,the,guys,have,in,store,tomorrow,'
 'i,love,waking,up,to,folgers,,too,bad,my,voice,was,deeper,than,his,,']


In [None]:
print(X_test)

In [29]:
#converting textual data into numerical data
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [30]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15094371 stored elements and shape (1280000, 531739)>
  Coords	Values
  (0, 4249)	0.24172931494985225
  (0, 475071)	0.12137942087229907
  (0, 504558)	0.304177217511959
  (0, 409785)	0.32909372343094934
  (0, 216431)	0.48529996213744236
  (0, 20677)	0.14949730470406813
  (0, 128781)	0.38373843395523044
  (0, 271967)	0.38710343110393963
  (0, 511785)	0.4135671246561763
  (1, 188178)	0.9765216325327424
  (1, 209246)	0.2154193612366067
  (2, 475071)	0.06900097394211752
  (2, 20677)	0.08498524339768355
  (2, 128781)	0.4362901963388204
  (2, 146253)	0.16656958415950338
  (2, 470517)	0.15947576994723203
  (2, 215350)	0.2837353574234843
  (2, 324647)	0.16759065904102674
  (2, 152292)	0.2506875641056442
  (2, 469532)	0.2864562259915306
  (2, 463803)	0.14227063911921747
  (2, 500987)	0.2817562980379064
  (2, 91494)	0.2683341390870039
  (2, 463397)	0.10229834155030085
  (2, 512194)	0.3536729077223584
  :	:
  (1279998, 110566)	0.1853836

In [31]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3688791 stored elements and shape (320000, 531739)>
  Coords	Values
  (0, 19391)	0.14882284106744512
  (0, 37875)	0.14084958903895045
  (0, 79911)	0.24232048946667173
  (0, 124656)	0.18107307206461426
  (0, 125033)	0.37496268780890596
  (0, 155718)	0.2216719829384014
  (0, 160145)	0.09903792187752404
  (0, 162536)	0.20529174002328723
  (0, 184322)	0.14616867008002488
  (0, 188442)	0.20153815233034333
  (0, 200366)	0.24421164510281854
  (0, 214066)	0.0961166392814001
  (0, 312999)	0.3924918221460324
  (0, 322233)	0.15426392341656062
  (0, 324647)	0.08813377214182222
  (0, 347767)	0.10680300290102243
  (0, 448938)	0.1911909918075984
  (0, 461212)	0.36019964556553297
  (0, 463803)	0.07481829931544681
  (0, 472675)	0.2810387264721657
  (0, 475071)	0.14514689899249508
  (0, 486480)	0.1580808774099091
  (1, 10209)	0.5302734435008909
  (1, 19391)	0.18192140127557985
  (1, 32428)	0.1462096253421788
  :	:
  (319995, 482697)	0.2415578

**Training Logistic Machine Learning Model**

**Logistic Regression Model**

In [32]:
model=LogisticRegression(max_iter=1000)

In [33]:
model.fit(X_train,Y_train)

**Model Evaluation**

Accuracy Score

In [34]:
#Accuracy score on Training data
X_train_prediction=model.predict(X_train)
training_data_accuracy_score=accuracy_score(X_train_prediction,Y_train)

In [35]:
#training data accuracy score
print('Accuracy score on training data:',training_data_accuracy_score)

Accuracy score on training data: 0.82000546875


In [36]:
#Accuracy score on Test data
X_test_prediction=model.predict(X_test)
test_data_accuracy_score=accuracy_score(X_test_prediction,Y_test)

In [37]:
#test data accuracy score
print('Accuracy score on test data:',test_data_accuracy_score)

Accuracy score on test data: 0.8016375


**Model Accuracy=80%**

Saving the training model

In [38]:
import pickle

In [39]:
Filename='trained_model.sav'
pickle.dump(model,open(Filename,'wb'))

**Using saved model for future prediction**

In [45]:
#loaded model
loaded_model=pickle.load(open('/content/trained_model.sav','rb'))

In [41]:
X_new=X_test[200]
print(Y_test[200])
prediction=loaded_model.predict(X_new)
print(prediction)

if prediction[0]==0:
  print('Negative Tweet')
else:
  print('Positive tweet')


1
[1]
Positive tweet


In [42]:
X_new=X_test[3]
print(Y_test[3])
prediction=loaded_model.predict(X_new)
print(prediction)

if prediction[0]==0:
  print('Negative Tweet')
else:
  print('Positive tweet')


0
[0]
Negative Tweet


In [43]:
X_new=X_test[450]
print(Y_test[450])
prediction=loaded_model.predict(X_new)
print(prediction)

if prediction[0]==0:
  print('Negative Tweet')
else:
  print('Positive tweet')


1
[1]
Positive tweet
