In [1]:
# re   - regular expressions (used to strip non-letter characters from the tweets)
# nltk - Natural Language Toolkit (stop-word list and Porter stemmer)
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
# Fetch the NLTK stop-word corpus that the stemming step filters against.
# download() reports "already up-to-date" when the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Teena
[nltk_data]     goyal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop-word list that will be removed from every tweet.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# NOTE(review): this read uses pandas' default header handling, so the first
# row of the CSV is consumed as column names. The frame is overwritten by the
# next cell, which re-reads the same file with explicit column names, so this
# cell appears redundant — confirm and remove.
twitter_data = pd.read_csv("twitter_data1.csv")

In [5]:
# Column labels are supplied explicitly; with `names` given, pandas treats
# every row of the file as data (header=None spells that out).
columns = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]
twitter_data = pd.read_csv("twitter_data1.csv", names=columns, header=None)

In [6]:
# (number of rows, number of columns) of the loaded frame
twitter_data.shape

(80001, 6)

In [7]:
# Preview the first five rows of the dataset.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...
2,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...
3,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...
4,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...


In [8]:
# Per-column count of missing entries (isna is the canonical spelling of isnull).
twitter_data.isna().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
# Class balance of the raw `target` label:
#   0 - negative
#   4 - positive
twitter_data['target'].value_counts()

target
4    40001
0    40000
Name: count, dtype: int64

In [10]:
# Recode the positive label from 4 to 1 so the target becomes binary 0/1.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [11]:
# Class balance after recoding: the former 4s should now be counted as 1.
twitter_data['target'].value_counts()

target
1    40001
0    40000
Name: count, dtype: int64

##### The target labels represent negative and positive tweets:
- 0 -> Negative Tweet
- 1 -> Positive Tweet

**STEMMING**: the process of reducing a word to its root form (stem) by stripping suffixes; the stem is not always a dictionary word.
- E.g. acting, acted, acts -> act; early -> earli

In [12]:
# Single Porter stemmer instance shared by every call to stemming().
port_stem = PorterStemmer()

In [13]:
# Cache for the English stop-word set; filled on first use by _english_stopwords().
_STOPWORD_CACHE = {}


def _english_stopwords():
  """Return the NLTK English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']


def stemming(content):
  """Normalise one tweet into a space-separated string of stemmed tokens.

  Steps: replace every non-letter character with a space, lower-case the
  text, split on whitespace, drop English stop words, and stem the remaining
  words with the module-level ``port_stem``.

  Args:
    content: raw tweet text (str).

  Returns:
    str: the stemmed tokens joined by single spaces; empty string when no
    token survives.
  """
  # Look the stop words up once per call in a cached set instead of
  # re-reading the corpus list for every single word.
  stop_words = _english_stopwords()

  # Anything that is not an ASCII letter (digits, punctuation, '@', URLs'
  # symbols, ...) becomes a separator.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()

  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in stop_words]
  return ' '.join(stemmed_tokens)

In [None]:
# Clean and stem every tweet: stemming() is called once per row of `text`,
# so this is the slow step of the notebook on the full dataset.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [15]:
# Preview the frame again, now including the new `stemmed_content` column.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,NO_QUERY,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...,nkluvr eva poor littl dumpl holmdel vid realli...
2,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,NO_QUERY,Young_J,I'm off too bed. I gotta wake up hella early t...,bed gotta wake hella earli tomorrow morn
3,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,NO_QUERY,dougnawoschik,I havent been able to listen to it yet My spe...,havent abl listen yet speaker bust
4,0,1993474319,Mon Jun 01 10:26:09 PDT 2009,NO_QUERY,thireven,now remembers why solving a relatively big equ...,rememb solv rel big equat two unknown total pa...


In [16]:
# Inspect the cleaned text that will become the model input.
print(twitter_data['stemmed_content'])

0        switchfoot http twitpic com zl awww bummer sho...
1        nkluvr eva poor littl dumpl holmdel vid realli...
2                 bed gotta wake hella earli tomorrow morn
3                       havent abl listen yet speaker bust
4        rememb solv rel big equat two unknown total pa...
                               ...                        
79996                   joshuawaldorf seriou issu let talk
79997                                        download movi
79998    shellykram cadillac chri true grill master bre...
79999    love sunni day walk dad shop gonna go romford ...
80000    clair de lune ever tri send feedback note goog...
Name: stemmed_content, Length: 80001, dtype: object


In [17]:
# Inspect the 0/1 labels that will become the model target.
print(twitter_data['target'])

0        0
1        0
2        0
3        0
4        0
        ..
79996    0
79997    1
79998    1
79999    1
80000    1
Name: target, Length: 80001, dtype: int64


In [18]:
# Separate features (stemmed text) and labels as plain NumPy arrays.
# to_numpy() always yields an ndarray, whereas `.values` can return a pandas
# extension array depending on the column dtype.
X = twitter_data['stemmed_content'].to_numpy()
Y = twitter_data['target'].to_numpy()

In [19]:
# Feature array: one stemmed tweet per element.
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'nkluvr eva poor littl dumpl holmdel vid realli tri hope dont tri hard tonight xx'
 'bed gotta wake hella earli tomorrow morn' ...
 'shellykram cadillac chri true grill master breakzqueen xmason taramonst'
 'love sunni day walk dad shop gonna go romford get smoothi zumo get pair shoe'
 'clair de lune ever tri send feedback note googl wordpress issu']


In [20]:
# Label array aligned element-for-element with X.
print(Y)

[0 0 0 ... 1 1 1]


In [21]:
# Hold out 20% of the tweets for testing; stratifying on Y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [22]:
# Sizes of the full set, the training split and the test split, in that order.
print(*(split.shape for split in (X, X_train, X_test)))

(80001,) (64000,) (16001,)


In [23]:
# Training tweets (still text at this point).
print(X_train)

['funmiosiyal lol funni' 'mani adventur tonight good see mama mo'
 'spend time kitten puppi chicken babi calf like countri' ...
 'scottgingold yeah well display disappoint right wing pundit thought beck real'
 'glad dalla irv ft worth traffic like like haha chill time'
 'right get lazi morn chicken lump wine make today']


In [24]:
# Test tweets (still text at this point).
print(X_test)

['garageglamor thank pic comment what' 'home second foot surgeri mad pain'
 'drove twoblu last time ever' ... 'bbqnyc that right unc game'
 'jessebr heck see' 'bad woman stole mom entir bag insid church']


In [25]:
# Convert the stemmed text into numerical TF-IDF feature vectors.

vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split so nothing is learned from the test tweets.
# NOTE(review): X_train / X_test are rebound from text arrays to feature
# matrices here, so re-running this cell on its own would feed matrices,
# not text, to the vectorizer — re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [26]:
# Training features after vectorization: (row, feature index) -> TF-IDF weight.
print(X_train)

  (0, 17586)	0.46706256360621773
  (0, 29180)	0.31339037544285037
  (0, 17579)	0.8268246695989285
  (1, 32930)	0.45163505376571306
  (1, 30456)	0.469560953518571
  (1, 43778)	0.2532485775363915
  (1, 18852)	0.22048980039378344
  (1, 50561)	0.2925350452557704
  (1, 607)	0.5020816112399025
  (1, 30563)	0.353730120276495
  (2, 10152)	0.3518812553040322
  (2, 28609)	0.18413407841383458
  (2, 7146)	0.4906332077122054
  (2, 3518)	0.27439089805983335
  (2, 8489)	0.33880601579259206
  (2, 39849)	0.3518812553040322
  (2, 26868)	0.384857302241266
  (2, 50105)	0.19402282019016767
  (2, 46355)	0.31920680110193983
  (3, 18541)	0.4840791062471063
  (3, 43179)	0.8750242390328412
  (4, 17334)	0.31531056472774466
  (4, 11738)	0.43053591964895577
  (4, 33341)	0.29661960429221046
  (4, 54619)	0.30124343946742504
  :	:
  (63997, 49867)	0.21702043300887175
  (63997, 54564)	0.30083722469780644
  (63997, 40798)	0.24102286640480722
  (63997, 55842)	0.19864941466689695
  (63997, 41702)	0.19103154978525902
  (6

In [27]:
# Test features after vectorization: (row, feature index) -> TF-IDF weight.
print(X_test)

  (0, 54125)	0.43089382597581993
  (0, 49165)	0.2529445626690798
  (0, 38344)	0.3599664481726632
  (0, 17846)	0.6644841302746061
  (0, 9645)	0.4233610229523088
  (1, 47825)	0.4790722736854432
  (1, 43748)	0.4073131002479347
  (1, 37157)	0.38062827543811695
  (1, 30147)	0.41190658167181543
  (1, 21196)	0.27403797377433525
  (1, 16926)	0.46362060760321866
  (2, 50105)	0.34376609681835046
  (2, 27820)	0.3887016999322384
  (2, 15360)	0.47469747671812007
  (2, 13610)	0.7109136127164217
  (3, 48924)	0.49510321688900893
  (3, 41207)	0.4331411996227813
  (3, 39695)	0.5438676320387416
  (3, 21344)	0.25688897440807984
  (3, 16869)	0.2859800091424865
  (3, 6935)	0.35170014761795787
  (4, 20799)	1.0
  (5, 55853)	0.33994034554608743
  (5, 53558)	0.5384740341533512
  (5, 51050)	0.40574387216151653
  :	:
  (15997, 53959)	0.13948812739352312
  (15997, 52450)	0.4972125984649937
  (15997, 48487)	0.26832881157183913
  (15997, 47445)	0.1698286981923086
  (15997, 31251)	0.17513107441133913
  (15997, 10808)

**Training the Machine Learning Model:**
- Logistic Regression

In [28]:
# Logistic-regression classifier; max_iter caps the solver at 1000 iterations.
model = LogisticRegression(max_iter = 1000)

In [29]:
# Train on the TF-IDF training features and their 0/1 labels.
model.fit(X_train, Y_train)

In [39]:
from sklearn.svm import SVC
# Second model for comparison: support vector classifier with a linear kernel.
# NOTE(review): this cell's execution count is out of sequence with its
# position in the notebook — verify the notebook with Restart & Run All.
svc_classifier = SVC(kernel='linear', C=1.0)

# Train SVC classifier on the same TF-IDF training features and labels.
svc_classifier.fit(X_train, Y_train)

In [40]:
# Accuracy of the linear SVC on the training data.
# Stored under an SVC-specific name: the logistic-regression cell further down
# assigns `training_data_accuracy`, which previously overwrote this result
# before it was ever displayed.
X_train_prediction2 = svc_classifier.predict(X_train)
svc_training_data_accuracy = accuracy_score(Y_train, X_train_prediction2)
print("SVC accuracy score on the training data:", svc_training_data_accuracy*100, "%")

##### Accuracy Score

In [30]:
# Accuracy of the logistic-regression model on the data it was trained on
# (fraction of training tweets whose predicted label matches Y_train).
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [31]:
# accuracy_score returns a fraction; scale to a percentage for display.
print("Accuracy score on the training data:", training_data_accuracy*100, "%")

Accuracy score on the training data: 82.9171875 %


In [32]:
# Accuracy of the logistic-regression model on the held-out test split
# (fraction of test tweets whose predicted label matches Y_test).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [33]:
# accuracy_score returns a fraction; scale to a percentage for display.
print("Accuracy score on the testing data:", test_data_accuracy*100, "%")

Accuracy score on the testing data: 76.03899756265233 %


##### Model accuracy (logistic regression, test data) ≈ 76.04%

**Saving the trained model**

In [34]:
import pickle

In [35]:
filename = "trained_model.sav"
# Write the model in binary form. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE: only the classifier is persisted; the fitted `vectorizer` is also
# needed to turn new raw text into features for this model.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

**Using the saved model for future predictions**

In [36]:
# Load the saved model back from disk; the context manager closes the file.
# pickle can execute arbitrary code while loading, so only unpickle files
# you created yourself.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [37]:
# Sanity-check the reloaded model on a single held-out example.
X_new = X_test[512]
print(Y_test[512])  # true label of that example

# Predict with the model restored from disk rather than the in-memory `model`,
# so this cell actually exercises the saved artifact.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label 0 is negative, anything else (i.e. 1) is positive.
if prediction[0] == 0:
    print("negative tweet")
else:
    print("Positive Tweet")

1
[1]
Positive Tweet
