In [1]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os
base_folder = os.getcwd()
training_dataset_path = os.path.join(
    base_folder, 
    "trainingandtestdata", 
    "training.1600000.processed.noemoticon.csv")

In [3]:
training_dataset = pd.read_csv(
    training_dataset_path, 
    encoding="latin-1", 
    warn_bad_lines=True,
    error_bad_lines=False,
    header=None, 
    names=["polarity", "id", "date", "query", "user", "tweet"])

In [4]:
training_dataset = training_dataset.sample(10000)
training_dataset

Unnamed: 0,polarity,id,date,query,user,tweet
1014554,4,1881451636,Fri May 22 04:42:11 PDT 2009,NO_QUERY,Ali_xxxxxx,@SunshineeKiid no tmorro at 2:15
1002809,4,1880247798,Fri May 22 00:35:03 PDT 2009,NO_QUERY,peterfabricius,Got his full 7 hours in
1330589,4,2015843049,Wed Jun 03 05:38:20 PDT 2009,NO_QUERY,turtlebisque,Last day of school!
421942,0,2062535957,Sat Jun 06 23:31:09 PDT 2009,NO_QUERY,charismitaine,"Had craftime with @reading_angel, but unfortun..."
1105992,4,1971358938,Sat May 30 06:55:05 PDT 2009,NO_QUERY,EgoPimp,"@Woodeh Yup, just tried and working fine. Pho..."
...,...,...,...,...,...,...
885692,4,1686551513,Sun May 03 05:54:06 PDT 2009,NO_QUERY,peihanko,@Scrawnyy NOT FUNNY RONALD. i got stopped cos ...
177292,0,1965580754,Fri May 29 16:10:22 PDT 2009,NO_QUERY,yogeorge,@morganeleanor ew I know EXACTLY what you're t...
892206,4,1691124764,Sun May 03 17:08:59 PDT 2009,NO_QUERY,rebecca_clemons,gonna go get my hair trimmed. fun
162857,0,1957640696,Fri May 29 01:08:04 PDT 2009,NO_QUERY,kelbdubs,Last day in Dijon


In [5]:
training_dataset.polarity.value_counts()

0    5088
4    4912
Name: polarity, dtype: int64

In [6]:
## Cleansing text
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet).get_text() # extract text from HTML
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', '', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = word_tokenize(tweet) # remove repeated characters (helloooooooo into hello)    
    stopwords_en = stopwords.words("english")
    tweet = [word for word in tweet if word not in stopwords_en] #remove stop words
    return ' '.join(tweet)

training_dataset["cleansed"] = training_dataset.apply(lambda row : clean_tweet(row["tweet"]), axis = 1)

In [7]:
training_dataset

Unnamed: 0,polarity,id,date,query,user,tweet,cleansed
1014554,4,1881451636,Fri May 22 04:42:11 PDT 2009,NO_QUERY,Ali_xxxxxx,@SunshineeKiid no tmorro at 2:15,tmorro 2:15
1002809,4,1880247798,Fri May 22 00:35:03 PDT 2009,NO_QUERY,peterfabricius,Got his full 7 hours in,got full 7 hours
1330589,4,2015843049,Wed Jun 03 05:38:20 PDT 2009,NO_QUERY,turtlebisque,Last day of school!,last day school !
421942,0,2062535957,Sat Jun 06 23:31:09 PDT 2009,NO_QUERY,charismitaine,"Had craftime with @reading_angel, but unfortun...",craftime unfortunate lack made dandy little bo...
1105992,4,1971358938,Sat May 30 06:55:05 PDT 2009,NO_QUERY,EgoPimp,"@Woodeh Yup, just tried and working fine. Pho...","yup , tried working fine . photos downloaded p..."
...,...,...,...,...,...,...,...
885692,4,1686551513,Sun May 03 05:54:06 PDT 2009,NO_QUERY,peihanko,@Scrawnyy NOT FUNNY RONALD. i got stopped cos ...,funny ronald . got stopped cos roadblock.you '...
177292,0,1965580754,Fri May 29 16:10:22 PDT 2009,NO_QUERY,yogeorge,@morganeleanor ew I know EXACTLY what you're t...,ew know exactly 're talking
892206,4,1691124764,Sun May 03 17:08:59 PDT 2009,NO_QUERY,rebecca_clemons,gonna go get my hair trimmed. fun,gon na go get hair trimmed . fun
162857,0,1957640696,Fri May 29 01:08:04 PDT 2009,NO_QUERY,kelbdubs,Last day in Dijon,last day dijon


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
text_counts = cv.fit_transform(training_dataset["cleansed"])
text_counts

<10000x13454 sparse matrix of type '<class 'numpy.int64'>'
	with 68351 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2500) #, min_df=7, max_df=0.8)
vectorized_features = vectorizer.fit_transform(features).toarray()

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(vectorized_features, training_dataset['polarity'], test_size=0.25, random_state=5)

ValueError: Found input variables with inconsistent numbers of samples: [1, 10000]

In [13]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

MultinomialNB()

In [14]:
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)

In [15]:
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

72.84%


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(Y_test,predicted))
print(classification_report(Y_test,predicted))
print(accuracy_score(Y_test, predicted))

[[953 305]
 [374 868]]
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      1258
           4       0.74      0.70      0.72      1242

    accuracy                           0.73      2500
   macro avg       0.73      0.73      0.73      2500
weighted avg       0.73      0.73      0.73      2500

0.7284


In [28]:
X_test

[<2500x13454 sparse matrix of type '<class 'numpy.int64'>'
 	with 16931 stored elements in Compressed Sparse Row format>]

In [19]:
predicted

array([4, 0, 4, ..., 4, 0, 0])

In [80]:
def get_sentiment(tweet):

    prediction = text_classifier.predict(tweet)
    #print(prediction)

In [31]:
li = []
for index, each_tweet in training_dataset.iterrows():
    each_tweet["cleansed"] = clean_tweet(each_tweet["tweet"])
    features = [each_tweet["tweet"]]
    each_tweet["vectorized"] = cv.transform(features)
    each_tweet["predicted"] = MNB.classify(cv.transform([each_tweet["cleansed"]]))
    li.append(each_tweet)
#     print(features)
    
new_dataset = pd.DataFrame(li) 
new_dataset

AttributeError: 'MultinomialNB' object has no attribute 'classify'

In [75]:
training_dataset["cleansed"] = training_dataset.apply(lambda row : clean_twit(row["tweet"]), axis = 1)

In [76]:
training_dataset["vectorized"] = vectorizer.fit_transform(training_dataset["cleansed"])

In [81]:
training_dataset["prediction"] = training_dataset.apply(lambda row : get_sentiment(row["vectorized"]), axis = 1)

KeyboardInterrupt: 

In [86]:
list(training_dataset["vectorized"])[0] 

<10000x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 53223 stored elements in Compressed Sparse Row format>

In [None]:
https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/