In [None]:
#Importing required libraries
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings 
warnings.filterwarnings("ignore")

In [None]:
#Read the dataset
#The columns are named by hand in a later cell, i.e. the file has no header row:
#header=None keeps the first tweet as data instead of consuming it as column labels.
#NOTE(review): the Sentiment140 dump is presumably Latin-1 encoded, not UTF-8 - confirm.
data = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
)

In [None]:
#View the top rows to sanity-check what read_csv produced
data.head()

In [None]:
#View DataFrame Columns (the labels assigned on load, before renaming)
data.columns

In [None]:
#Rename DataFrame Columns
#The six labels are assigned positionally, so the list must have exactly one
#entry per column of the loaded frame.
DATASET_COLUMNS = ["TARGET", "ID", "DATE", "FLAG", "USER", "TWEET"]
data.columns = DATASET_COLUMNS
data.head()

In [None]:
#Rewrite every occurrence of '2009' inside the DATE strings as '2019'
data['DATE'] = data['DATE'].str.replace('2009', '2019')

In [None]:
#Check the DATE column after the 2009 -> 2019 substitution
data.head()

In [None]:
#Snapshot the renamed frame (FLAG still present, no cleaning yet) without the index column
data.to_csv('Data_Before_Cleaning_Sentiment.csv', index=False)

In [None]:
#Inspect the dtype pandas inferred for each column
data.dtypes

In [None]:
#FLAG is not used anywhere below, so discard that column
data = data.drop(columns='FLAG')

In [None]:
#Confirm the FLAG column is gone
data.head()

In [None]:
#(rows, columns) of the full dataset before sub-sampling
data.shape

In [None]:
#Number of tweets per TARGET label in the full dataset
data.TARGET.value_counts()

In [None]:
#Sub-sample: the first 40000 tweets with TARGET == 4 and the first 20000 with TARGET == 0
positif_data = data.loc[data['TARGET'] == 4].head(40000)
print(positif_data.shape)
negative_data = data.loc[data['TARGET'] == 0].head(20000)
print(negative_data.shape)

In [None]:
#Stack the two subsets (positives first) and renumber the rows from 0
data = pd.concat([positif_data, negative_data], axis=0, ignore_index=True)
data.head()

In [None]:
#Label distribution of the sub-sample built above
data.TARGET.value_counts()

# Data Cleaning
* The Twitter handles are already masked as @user due to privacy concerns. So, these Twitter handles are hardly giving any information about the nature of the tweet.
* We can also think of getting rid of the punctuations, numbers and even special characters since they wouldn’t help in differentiating different kinds of tweets.
* Most of the smaller words do not add much value. For example, ‘pdx’, ‘his’, ‘all’. So, we will try to remove them as well from our data.
* Once we have executed the above three steps, we can split every tweet into individual words or tokens which is an essential step in any NLP task.
* In the 4th tweet, there is a word ‘love’. We might also have terms like loves, loving, lovable, etc. in the rest of the data. These terms are often used in the same context. If we can reduce them to their root word, which is ‘love’, then we can reduce the total number of unique words in our data without losing a significant amount of information.

In [None]:
#Removing Twitter Handles
#Drop the whole mention ("@" plus the user name), not just the "@" sign.
#A space is substituted so neighbouring words are not glued together.
data['CLEAN_TWEET'] = data.TWEET.str.replace(r'@\w+', ' ', regex=True)
data.head()

In [None]:
#Removing URL Links
#regex=True is required: pandas >= 2.0 treats the pattern as a literal string
#by default, which would leave every link in place.
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].str.replace(r"http\S+", ' ', regex=True)
data.head()

In [None]:
#Removing Punctuations and Numbers
#Every character that is not an ASCII letter becomes a space.
#regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default.
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].str.replace(r'[^a-zA-Z]', ' ', regex=True)
data.head()

In [None]:
#Removing Stop Words
#Stored as a set: it is membership-tested once per token of every tweet,
#and a set lookup is constant-time where a list lookup is linear.
stopwords = set(nltk.corpus.stopwords.words('english'))

Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

We would not want these words to take up space in our database or to use up valuable processing time. We can remove them easily by keeping a list of the words we consider to be stop words. NLTK (Natural Language Toolkit) in Python has lists of stop words stored for 16 different languages.

In [None]:
def change(text, stop_words=None):
    """Return ``text`` with all stop words removed.

    The text is split on whitespace, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are re-joined with single spaces.
    Matching is case-sensitive, so callers should lower-case ``text`` first.

    Parameters
    ----------
    text : str
        Whitespace-separated text to filter.
    stop_words : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stopwords``.

    Returns
    -------
    str
        The filtered text; an empty string if no token survives.
    """
    if stop_words is None:
        # Fall back to the notebook-level list so existing one-argument calls keep working
        stop_words = stopwords
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)
    
#Lower-case each tweet first, then strip its stop words with change()
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].str.lower().apply(change)
data.head()

# Tokenization
Tokens are individual terms or words, and tokenization is the process of splitting a string of text into tokens.

In [None]:
#Turn each cleaned tweet into a list of whitespace-separated tokens
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].str.split()
data.head()

# Stemming
* Stemming is a rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word. 
* For example – “play”, “player”, “played”, “plays” and “playing” are the different variations of the word – “play”.

In [None]:
#Import only the stemmer class; a wildcard import would pull unrelated names into the notebook
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#CLEAN_TWEET holds token lists at this point; stem every token of every tweet
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
data.head()

In [None]:
#Re-assemble each list of stemmed tokens into one space-separated string
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].apply(' '.join)
data.head()

* Now we’ll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new “pos/neg” labels derived from the compound score. 
* We’ll use this last column to perform an accuracy test. The tweets in this method will be classified as negative, positive, or neutral.

In [None]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
#The scoring code below is kept disabled inside a string literal; a later cell
#loads a ready-made twitter_sentiment.csv instead of rerunning it.
#Label convention used by the rest of the notebook: 0 = negative, 1 = positive, 2 = neutral.
'''#Calculating Negative, Positive, Neutral and Compound values
data[['polarity', 'subjectivity']] = data['CLEAN_TWEET'].apply(lambda text: pd.Series(TextBlob(text).sentiment))

#Build the analyzer once instead of once per tweet
analyzer = SentimentIntensityAnalyzer()
for index, row in data['CLEAN_TWEET'].items():
    score = analyzer.polarity_scores(row)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    comp = score['compound']
    if neg > pos:
        data.loc[index, 'sentiment'] = 0 #Negative
    elif pos > neg:
        data.loc[index, 'sentiment'] = 1 #Positive
    else:
        data.loc[index, 'sentiment'] = 2 #Neutral
        
    data.loc[index, 'neg'] = neg
    data.loc[index, 'neu'] = neu
    data.loc[index, 'pos'] = pos
    data.loc[index, 'compound'] = comp
    
data.to_csv('twitter_sentiment.csv',index=False)'''

In [None]:
#Replace the in-memory frame with a pre-scored CSV.
#NOTE(review): presumably the file written by the disabled scoring cell above - confirm
#that it was produced by the same cleaning steps.
data = pd.read_csv('../input/handson/twitter_sentiment.csv')
data.head()

In [None]:
#Missing CLEAN_TWEET values become empty strings so the .split() calls below do not fail on NaN
data['CLEAN_TWEET']=data['CLEAN_TWEET'].fillna('')

* We have to be a little careful here in selecting the length of the words which we want to remove. So, we have decided to remove all the words having length 3 or less. 
* For example, terms like “hmm”, “oh” are of very little use. It is better to get rid of them.

In [None]:
#Keep only the tokens longer than three characters
data['CLEAN_TWEET'] = data['CLEAN_TWEET'].apply(
    lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 3)
)
data.head()

In [None]:
#Store the sentiment label as an integer, then re-check the column dtypes
data['sentiment'] = data['sentiment'].astype(int)
data.dtypes

In [None]:
#Number of tweets per sentiment label
data.sentiment.value_counts()

In [None]:
from wordcloud import WordCloud

Word Cloud is a data visualization technique used for representing text data in which the size of each word indicates its frequency or importance. Significant textual data points can be highlighted using a word cloud. Word clouds are widely used for analyzing data from social network websites.

In [None]:
#Concatenate every tweet labelled sentiment == 1 and render the text as a word cloud
positive_words = ' '.join(data.loc[data['sentiment'] == 1, 'CLEAN_TWEET'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(positive_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear");

In [None]:
from collections import Counter
import plotly.express as px

In [None]:
#Count the tokens, take the 20 most frequent, then drop the single most frequent one
top = Counter(positive_words.split())
temp = pd.DataFrame(top.most_common(20)).iloc[1:, :]
temp.columns = ['Common_words','count']

fig = px.treemap(
    temp,
    path=['Common_words'],
    values='count',
    title='Tree of Most Common Positive Words',
)
fig

In [None]:
#Concatenate every tweet labelled sentiment == 0 and render the text as a word cloud
negative_words = ' '.join(data.loc[data['sentiment'] == 0, 'CLEAN_TWEET'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negative_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear");

In [None]:
#Count the tokens, take the 20 most frequent, then drop the single most frequent one
top = Counter(negative_words.split())
temp = pd.DataFrame(top.most_common(20)).iloc[1:, :]
temp.columns = ['Common_words','count']

fig = px.treemap(
    temp,
    path=['Common_words'],
    values='count',
    title='Tree of Most Common Negative Words',
)
fig

In [None]:
#Concatenate every tweet labelled sentiment == 2 and render the text as a word cloud
neutral_words = ' '.join(data.loc[data['sentiment'] == 2, 'CLEAN_TWEET'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(neutral_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear");

In [None]:
#Count the tokens, take the 20 most frequent, then drop the single most frequent one
top = Counter(neutral_words.split())
temp = pd.DataFrame(top.most_common(20)).iloc[1:, :]
temp.columns = ['Common_words','count']
temp.head()

In [None]:
#Treemap of the neutral-word table built in the previous cell; tile area follows the 'count' column
fig = px.treemap(temp, path=['Common_words'], values='count',title='Tree of Most Common Neutral Words')
fig

In [None]:
#Bar chart of how many tweets carry each sentiment label (trailing ';' hides the Axes repr)
sns.countplot(x='sentiment',data=data);

# NLP Classification Task

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report

* Bag of words is a commonly used model in Natural Language Processing. 
* The idea behind this model is the creation of vocabulary that contains the collection of different words, and each word is associated with a count of how it occurs. 
* Later, the vocabulary is used to create d-dimensional feature vectors.

In [None]:
#Features: the cleaned tweet text. Target: the integer sentiment label.
x = data['CLEAN_TWEET']
y = data['sentiment']

* CountVectorizer is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text. This is helpful when we have multiple such texts, and we wish to convert each word in each text into vectors.
* CountVectorizer creates a matrix in which each unique word is represented by a column of the matrix, and each text sample from the document is a row in the matrix. The value of each cell is nothing but the count of the word in that particular text sample. 

In [None]:
#Learn the vocabulary from all tweets, then encode them as a document-term count matrix
count_vectorizer = CountVectorizer()
count_vectorizer.fit(x)
cv = count_vectorizer.transform(x)
cv.shape

* Train Dataset: We use datasets to train the model using various machine learning algorithms. Training a model is required so that it can understand the various patterns, rules, and features.
* Test Dataset: Once our machine learning model has been trained on a given dataset, then we test the model. In this step, we check for the accuracy of our model by providing a test dataset to it.

In [None]:
#Let's split our data into training and testing data.
xtrain,xtest,ytrain,ytest = train_test_split(cv, y,test_size=0.2,random_state=101)

# Training a Model

* Naive Bayes is based on Bayes’ theorem, where the adjective Naïve says that features in the dataset are mutually independent. 
* Occurrence of one feature does not affect the probability of occurrence of the other feature. 
* For small sample sizes, Naïve Bayes can outperform the most powerful alternatives. 
* Being relatively robust, easy to implement, fast, and accurate, it is used in many different fields.

In [None]:
#Multinomial Naive Bayes with its default settings
nb = MultinomialNB()

In [None]:
#Train on the count-matrix training split
nb.fit(xtrain,ytrain)

# Predictions and Evaluations

In [None]:
#Predict a sentiment label for every held-out row
predictions = nb.predict(xtest)

A confusion matrix is a table that is often used to describe the performance of a classification model on a set of test data for which the true values are known. 

Let's now define the most basic terms:
* true positives (TP): These are cases in which we predicted yes (they have the disease), and they do have the disease.
* true negatives (TN): We predicted no, and they don't have the disease.
* false positives (FP): We predicted yes, but they don't actually have the disease. (Also known as a "Type I error.")
* false negatives (FN): We predicted no, but they actually do have the disease. (Also known as a "Type II error.")

In [None]:
#Confusion matrix, a blank separator, then the per-class precision/recall report
print(
    confusion_matrix(ytest, predictions),
    '\n',
    classification_report(ytest, predictions),
    sep='\n',
)

In [None]:
#Share of held-out tweets whose predicted label equals the true label
nb_accuracy = accuracy_score(ytest, predictions)
print(nb_accuracy)

* The Random forest is a supervised Machine learning algorithm used for classification, regression, and other tasks using decision trees.
* The Random forest classifier creates a set of decision trees from a randomly selected subset of the training set. 
* It is basically a set of decision trees (DT), each built from a randomly selected subset of the training set; it then collects the votes from the different decision trees to decide the final prediction.

In [None]:
#Fit a 10-tree random forest on the same split and report its held-out accuracy
rf = RandomForestClassifier(n_estimators=10, random_state=42).fit(xtrain, ytrain)
predictions = rf.predict(xtest)
print(accuracy_score(ytest, predictions))

Pipelines are used for splitting up your machine learning workflows into independent, reusable, modular parts that can then be pipelined together to continuously improve the accuracy of the model and achieve a successful algorithm.

Pipeline will include the following steps:
1. Preprocessing Text and Building Vocabulary: Removing unwanted texts (stop words), punctuations, URLs, handles, etc. which do not have any sentimental value. And then adding unique preprocessed words to a vocabulary.
2. Feature Extraction: Iterating through each data example to extract features using a frequency dictionary and finally create a feature matrix.
3. Training Model: We’ll then use our feature matrix to train a Multinomial Naive Bayes model in order to use that model for predicting sentiments.
4. Testing Model: Using our trained model to get the predictions from data it never saw.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
#Two-stage pipeline: raw strings -> token counts -> Naive Bayes classifier
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

In [None]:
#Re-split on the raw cleaned text (not the count matrix): the pipeline vectorizes internally.
#This split holds out 30% with a different seed than the earlier one and overwrites
#xtrain/xtest/ytrain/ytest.
x = data.CLEAN_TWEET
y = data.sentiment
xtrain,xtest,ytrain,ytest = train_test_split(x, y,test_size=0.3,random_state=62)

In [None]:
#Fit both pipeline stages on the training text only
pipeline.fit(xtrain,ytrain)

* A Classification report is used to measure the quality of predictions from a classification algorithm. How many predictions are True and how many are False. 
* More specifically, True Positives, False Positives, True negatives and False Negatives are used to predict the metrics of a classification report as shown below.

In [None]:
#Score the held-out text with the fitted pipeline and print both evaluation tables
predictions = pipeline.predict(xtest)
print(confusion_matrix(ytest,predictions))
print(classification_report(ytest,predictions))

* Pickle is the standard way of serializing objects in Python.
* You can use the pickle operation (here through the joblib library, which is used in the next cell) to serialize your machine learning algorithms and save the serialized format to a file.
* Later you can load this file to deserialize your model and use it to make new predictions.

In [None]:
import joblib
#Dump the pipeline model
#Serializes the fitted vectorizer + classifier to a file named 'Sentiment' in the working directory
joblib.dump(pipeline,'Sentiment')

In [None]:
#Test the pipeline with a sample tweet
#predict expects an iterable of strings, hence the one-element list
pipeline.predict(['killer'])