In [60]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
import os
base_folder = os.getcwd()
training_dataset_path = os.path.join(
    base_folder, 
    "trainingandtestdata", 
    "training.1600000.processed.noemoticon.csv")

In [62]:
training_dataset = pd.read_csv(
    training_dataset_path, 
    encoding="latin-1", 
    warn_bad_lines=True,
    error_bad_lines=False,
    header=None, 
    names=["polarity", "id", "date", "query", "user", "tweet"])

In [63]:
training_dataset = training_dataset.sample(10000)
training_dataset

Unnamed: 0,polarity,id,date,query,user,tweet
1275757,4,2001124104,Mon Jun 01 23:54:51 PDT 2009,NO_QUERY,lucytaylor123,"waiting for lulu, while i wait im looking on y..."
10947,0,1551136705,Sat Apr 18 08:14:47 PDT 2009,NO_QUERY,sdohana,bummed we're not in puerto rico rappelling dow...
466283,0,2175529740,Mon Jun 15 01:14:45 PDT 2009,NO_QUERY,triiishh,@eripeng Hahaha. To tweet is hard hard work.
350408,0,2017828702,Wed Jun 03 09:00:54 PDT 2009,NO_QUERY,louisewaves,@takebrokenme Poor u! U had enough fluids &a...
722368,0,2261462524,Sat Jun 20 20:49:47 PDT 2009,NO_QUERY,felecia91,is super frustrated!!! *-* Needs God's help de...
...,...,...,...,...,...,...
1539685,4,2180202998,Mon Jun 15 10:08:56 PDT 2009,NO_QUERY,TheAngelPandora,Wishing some of my friends had twitter so I ca...
394557,0,2055662923,Sat Jun 06 09:43:45 PDT 2009,NO_QUERY,sweet0nes,Locked my keys in the trunk@walmart! We r wait...
1211470,4,1989155805,Mon Jun 01 00:31:56 PDT 2009,NO_QUERY,ElsaEnah,just got homeIsawI love you man ... very funny...
976352,4,1833625317,Sun May 17 23:54:15 PDT 2009,NO_QUERY,cdub4,@DwightHoward congrats!


In [64]:
training_dataset.polarity.value_counts()

0    5035
4    4965
Name: polarity, dtype: int64

In [65]:
## Cleansing text
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet).get_text() # extract text from HTML
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', '', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = word_tokenize(tweet) # remove repeated characters (helloooooooo into hello)    
    stopwords_en = stopwords.words("english")
    tweet = [word for word in tweet if word not in stopwords_en] #remove stop words
    return ' '.join(tweet)

training_dataset["cleansed"] = training_dataset.apply(lambda row : clean_tweet(row["tweet"]), axis = 1)

In [66]:
training_dataset

Unnamed: 0,polarity,id,date,query,user,tweet,cleansed
1275757,4,2001124104,Mon Jun 01 23:54:51 PDT 2009,NO_QUERY,lucytaylor123,"waiting for lulu, while i wait im looking on y...","waiting lulu , wait im looking youtube cool st..."
10947,0,1551136705,Sat Apr 18 08:14:47 PDT 2009,NO_QUERY,sdohana,bummed we're not in puerto rico rappelling dow...,"bummed 're puerto rico rappelling waterfall , ..."
466283,0,2175529740,Mon Jun 15 01:14:45 PDT 2009,NO_QUERY,triiishh,@eripeng Hahaha. To tweet is hard hard work.,hahaha . tweet hard hard work .
350408,0,2017828702,Wed Jun 03 09:00:54 PDT 2009,NO_QUERY,louisewaves,@takebrokenme Poor u! U had enough fluids &a...,poor u ! u enough fluids & salts ? want leave ...
722368,0,2261462524,Sat Jun 20 20:49:47 PDT 2009,NO_QUERY,felecia91,is super frustrated!!! *-* Needs God's help de...,super frustrated ! ! ! * - * needs god 's help...
...,...,...,...,...,...,...,...
1539685,4,2180202998,Mon Jun 15 10:08:56 PDT 2009,NO_QUERY,TheAngelPandora,Wishing some of my friends had twitter so I ca...,wishing friends twitter bug
394557,0,2055662923,Sat Jun 06 09:43:45 PDT 2009,NO_QUERY,sweet0nes,Locked my keys in the trunk@walmart! We r wait...,locked keys trunk r waiting 4 locksmith . $ 55...
1211470,4,1989155805,Mon Jun 01 00:31:56 PDT 2009,NO_QUERY,ElsaEnah,just got homeIsawI love you man ... very funny...,got homeisawi love man ... funny movie
976352,4,1833625317,Sun May 17 23:54:15 PDT 2009,NO_QUERY,cdub4,@DwightHoward congrats!,congrats !


In [67]:
features = training_dataset.iloc[:, 6].values
labels = training_dataset.iloc[:, 0].values

In [68]:
labels

array([4, 0, 0, ..., 4, 4, 4])

In [69]:
features

array(['waiting lulu , wait im looking youtube cool stuff ummmmm',
       "bummed 're puerto rico rappelling waterfall , zip lining bungee jumping couple boo coming back early .",
       'hahaha . tweet hard hard work .', ...,
       'got homeisawi love man ... funny movie', 'congrats !',
       'finished 98 name badges brides networking tomorrow ! see everyone'],
      dtype=object)

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2500) #, min_df=7, max_df=0.8)
vectorized_features = vectorizer.fit_transform(features).toarray()

In [75]:
vectorizer.get_feature_names()

['00',
 '09',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '13',
 '14',
 '140',
 '15',
 '16',
 '17',
 '18',
 '1st',
 '20',
 '200',
 '2009',
 '22',
 '23',
 '24',
 '25',
 '2day',
 '2morrow',
 '2nd',
 '30',
 '31',
 '33',
 '360',
 '3am',
 '3d',
 '3g',
 '3rd',
 '40',
 '45',
 '4th',
 '50',
 '5am',
 '5th',
 '60',
 '7pm',
 '80',
 'able',
 'absolutely',
 'abt',
 'ac',
 'accident',
 'according',
 'account',
 'accounts',
 'ace',
 'ache',
 'across',
 'act',
 'acting',
 'action',
 'actor',
 'actual',
 'actually',
 'ad',
 'adam',
 'add',
 'added',
 'addicted',
 'addiction',
 'admit',
 'adorable',
 'ads',
 'advance',
 'advice',
 'afraid',
 'afternoon',
 'ages',
 'ago',
 'agree',
 'agreed',
 'ah',
 'aha',
 'ahaha',
 'ahead',
 'ahh',
 'ahhh',
 'ahhhh',
 'ai',
 'aim',
 'aint',
 'air',
 'airport',
 'album',
 'alcohol',
 'alex',
 'alive',
 'all',
 'allergic',
 'allowed',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'although',
 'always',
 'amanda',
 'amazing',
 'amber',
 'amen',
 'a

In [76]:
vectorized_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [77]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectorized_features, labels, test_size=0.2, random_state=0)

In [78]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [79]:
predictions = text_classifier.predict(X_test)

In [80]:
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[723 275]
 [286 716]]
              precision    recall  f1-score   support

           0       0.72      0.72      0.72       998
           4       0.72      0.71      0.72      1002

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000

0.7195


In [82]:
predictions

array([0, 4, 0, ..., 0, 0, 4])

In [83]:
# text_classifier.classify("'noel says holla may somethin ya .. & yes shall b pickin ur ass atl airport t-minus 24 days !'")

In [80]:
def get_sentiment(tweet):

    prediction = text_classifier.predict(tweet)
    #print(prediction)

In [84]:
li = []
for index, each_tweet in training_dataset.iterrows():
    each_tweet["cleansed"] = clean_tweet(each_tweet["tweet"])
    features = [each_tweet["tweet"]]
    each_tweet["vectorized"] = vectorizer.fit_transform(features)
    each_tweet["prediction"] = text_classifier.classify(each_tweet["vectorized"])
    li.append(each_tweet)
#     print(features)
    
new_dataset = pd.DataFrame(li) 
new_dataset

AttributeError: 'RandomForestClassifier' object has no attribute 'classify'

In [86]:
list(training_dataset["vectorized"])[0] 

<10000x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 53223 stored elements in Compressed Sparse Row format>

In [None]:
https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/

In [57]:
test_dataset_path = os.path.join(
    base_folder, 
    "trainingandtestdata", 
    "testdata.manual.2009.06.14.csv"    
)

test_dataset = pd.read_csv(
    test_dataset_path, 
    encoding="latin-1", 
    warn_bad_lines=True,
    error_bad_lines=False,
    header=None, 
    names=["polarity", "id", "date", "query", "user", "tweet"])

In [59]:
test_dataset

Unnamed: 0,polarity,id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."
