In [20]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [21]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding="ISO-8859-1",names=["target", "ids", "date", "flag", "user", "text"])
df = df.dropna()
del df["flag"]
del df["user"]
del df["date"]
if (len(df["target"].unique()) == 1):
    del df["target"]
else:
    print(df["target"].unique())
df.head()
train_tweets = df.sample(frac = 0.8)
 
# Creating dataframe with
# rest of the 20% values
test_tweets = df.drop(train_tweets.index)

#We have train and test, now we need to...
# Today's goal: Get a model, train it, test it
df.head()

[0 4]


Unnamed: 0,target,ids,text
0,0,1467810369,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,is upset that he can't update his Facebook by ...
2,0,1467810917,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,my whole body feels itchy and like its on fire
4,0,1467811193,"@nationwideclass no, it's not behaving at all...."


In [22]:
def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

print(form_sentence(train_tweets['text'].iloc[10]))
print(train_tweets['text'].iloc[10])

not even mildly tanning today may need fake bake tan this years just remind me not to go too orange
not even mildly tanning today, may need fake bake tan this years, just remind me not to go too orange 


In [27]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def no_user_alpha(tweet):
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess
print(no_user_alpha(form_sentence(train_tweets['text'].iloc[10])))
print(train_tweets['text'].iloc[10])

['even', 'mildli', 'tan', 'today', 'may', 'need', 'fake', 'bake', 'tan', 'year', 'remind', 'go', 'orang']
not even mildly tanning today, may need fake bake tan this years, just remind me not to go too orange 


In [47]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def no_user_alpha(tweet):
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english') and word[0] != '@']
    return clean_mess
print(no_user_alpha(form_sentence(train_tweets['text'].iloc[10])))
print(train_tweets['text'].iloc[10])

['even', 'mildly', 'tanning', 'today', 'may', 'need', 'fake', 'bake', 'tan', 'years', 'remind', 'go', 'orange', 'thomas']
not even mildly tanning today, may need fake bake tan this years, just remind me not to go too orange @thomas


In [42]:
def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
tweet_list = 'I was playing with my friends with whom I used to play, when you called me yesterday'.split()
print(normalization(tweet_list))

['@bomato', 'I', 'be', 'play', 'with', 'my', 'friends', 'with', 'whom', 'I', 'use', 'to', 'play,', 'when', 'you', 'call', 'me', 'yesterday']


In [40]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer='word')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])


In [34]:
msg_train, msg_test, label_train, label_test = train_test_split(train_tweets['text'], train_tweets['target'], test_size=0.2)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79     70022
           4       0.73      0.80      0.76     57978

    accuracy                           0.77    128000
   macro avg       0.77      0.78      0.77    128000
weighted avg       0.78      0.77      0.78    128000

[[52696 17326]
 [11484 46494]]
0.774921875


In [38]:
label_test.head()

553952     0
1262353    4
739836     0
518296     0
1517863    4
Name: target, dtype: int64