## Importing Libraries

In [74]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91954\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Fetching dataset

In [3]:
twitter_data=pd.read_csv("C:\\Users\\91954\\OneDrive\\Desktop\\datasets\\twitter data\\training.1600000.processed.noemoticon.csv",encoding='ISO-8859-1')

In [4]:
twitter_data

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


### Data Manipulation and cleaning

In [29]:
column_names=['target','id','date','flag','user','text']

In [30]:
twitter_data=pd.read_csv("C:\\Users\\91954\\OneDrive\\Desktop\\datasets\\twitter data\\training.1600000.processed.noemoticon.csv",names=column_names,encoding='ISO-8859-1')

In [31]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [32]:
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [33]:
twitter_data['target']=twitter_data['target'].map({0:0,4:1})

In [34]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [35]:
port_stem=PorterStemmer()

In [36]:
data1=twitter_data[:10000]
data2=twitter_data[800001:810001]

#### Stemming

In [37]:
port_stem=PorterStemmer()

In [38]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [39]:
df= pd.concat([data1, data2], axis=0, ignore_index=True)

In [40]:
df.shape

(20000, 6)

In [41]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [44]:
df['target'].value_counts()

target
0    10000
1    10000
Name: count, dtype: int64

In [45]:
df['stemmed_content']=df['text'].apply(stemming)

In [46]:
df

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see
...,...,...,...,...,...,...,...
19995,1,1469642843,Tue Apr 07 06:59:39 PDT 2009,NO_QUERY,Codepope,@bensummers Isn't that sweet of them.... Altru...,bensumm sweet altruism finest
19996,1,1469642883,Tue Apr 07 06:59:40 PDT 2009,NO_QUERY,christyku,"@jakrose Um, milk *fathers* don't have udders....",jakros um milk father udder quot milk mother q...
19997,1,1469643029,Tue Apr 07 06:59:42 PDT 2009,NO_QUERY,EdRoberts,@zenaweist They could also tweet @BeccaRoberts,zenaweist could also tweet beccarobert
19998,1,1469643036,Tue Apr 07 06:59:42 PDT 2009,NO_QUERY,celeloriel,"Good lord, I still have 125 work emails to cat...",good lord still work email catch actual read t...


In [51]:
x=df['stemmed_content'].values
y=df['target'].values

In [50]:
x

array(['switchfoot http twitpic com zl awww bummer shoulda got david carr third day',
       'upset updat facebook text might cri result school today also blah',
       'kenichan dive mani time ball manag save rest go bound', ...,
       'zenaweist could also tweet beccarobert',
       'good lord still work email catch actual read teach go vacat',
       'gig northampton racehors tmw night'], dtype=object)

In [52]:
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [53]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [54]:
print(x.shape,x_train.shape,x_test.shape)

(20000,) (16000,) (4000,)


In [55]:
vectorizer=TfidfVectorizer()

x_train=vectorizer.fit_transform(x_train)
x_test=vectorizer.transform(x_test)

In [57]:
print(x_train)

  (0, 17404)	0.40994476934328805
  (0, 886)	0.43654144339515966
  (0, 16121)	0.3741350912295178
  (0, 5724)	0.4820087579992285
  (0, 8158)	0.518717018237565
  (1, 10259)	0.5219814371874156
  (1, 10178)	0.302371556590872
  (1, 13684)	0.3320574362513538
  (1, 19528)	0.3578843388979241
  (1, 7478)	0.3085066250923735
  (1, 532)	0.3983917764837187
  (1, 12278)	0.24711687963068293
  (1, 6424)	0.28775750642546605
  (2, 4174)	0.47211018945106925
  (2, 3387)	0.20070709048405125
  (2, 17735)	0.28960868657286326
  (2, 7839)	0.19426229695857355
  (2, 952)	0.2818268005177154
  (2, 9434)	0.3381806652509717
  (2, 4148)	0.17391851791629573
  (2, 12680)	0.2719587004137518
  (2, 10077)	0.187488915141534
  (2, 18801)	0.2916421719539334
  (2, 5310)	0.3874114872872652
  (2, 12192)	0.2166125522636099
  :	:
  (15997, 5388)	0.27774108950380194
  (15997, 19144)	0.32679870960547686
  (15997, 14560)	0.28755997393166416
  (15997, 16031)	0.3330290077031304
  (15997, 5714)	0.2907149633895632
  (15997, 13429)	0.2941

In [59]:
print(x_test)

  (0, 19501)	0.18588059660865938
  (0, 19461)	0.3720034801707212
  (0, 19130)	0.25545528802736633
  (0, 17847)	0.2511257225305132
  (0, 13977)	0.3963369556067868
  (0, 12211)	0.2825234600602973
  (0, 11582)	0.3276456875324955
  (0, 10303)	0.2316170411640318
  (0, 7674)	0.3120478265342367
  (0, 7104)	0.41511645523926394
  (0, 6797)	0.17647698493625144
  (1, 12637)	0.45275733902635995
  (1, 2212)	0.8916337768152179
  (2, 15316)	0.40960678099769354
  (2, 14031)	0.27610136302869975
  (2, 12185)	0.2640528209867101
  (2, 6177)	0.3796703295921933
  (2, 5768)	0.3098384433366736
  (2, 3592)	0.36143095478166576
  (2, 3458)	0.31609310398214935
  (2, 2745)	0.3572871813545398
  (2, 813)	0.2965058084788493
  (3, 18268)	0.3635577661822065
  (3, 10303)	0.3095177315550399
  (3, 9463)	0.5760631815697946
  :	:
  (3997, 8156)	0.317080025147229
  (3997, 6253)	0.3779933090428832
  (3997, 5951)	0.5618325134981152
  (3997, 5476)	0.30173416368269973
  (3997, 4917)	0.3779933090428832
  (3997, 3648)	0.1992410948

### ML model

In [60]:
model=LogisticRegression(max_iter=1000)

In [61]:
model.fit(x_train,y_train)

In [65]:
y_pred=model.predict(x_train)

In [66]:
accur=accuracy_score(y_pred,y_train)

In [70]:
print(f'Accuracy using Logistic Regression on Training data {accur}')

Accuracy using Logistic Regression on Testing data 0.85075


In [71]:
y_pred_test=model.predict(x_test)

In [72]:
accur_test=accuracy_score(y_pred_test,y_test)

In [73]:
print(f'Accuracy using Logistic Regression on Testing data {accur_test}')

Accuracy using Logistic Regression on Testing data 0.7375


In [75]:
model2=RandomForestClassifier()

In [76]:
model2.fit(x_train,y_train)

In [80]:
y_pred=model2.predict(x_train)

In [81]:
accur=accuracy_score(y_pred,y_train)

In [82]:
print(f'Accuracy using Random Forest on Training data {accur}')

Accuracy using Random Forest on Training data 0.998375


In [83]:
y_pred_test=model.predict(x_test)

In [84]:
accur_test=accuracy_score(y_pred_test,y_test)

In [85]:
print(f'Accuracy using Random Forest on Testing data {accur_test}')

Accuracy using Random Forest on Testing data 0.7375
