<a href="https://colab.research.google.com/github/venkat-krish/DataScienceMaterials/blob/master/NLP02_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np

# Data preprocessing libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

from gensim.models import Word2Vec # Word2Vec module
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, remove_stopwords, strip_numeric, stem_text

#Feature engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
# XGBoost
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data loading

In [4]:
cols = ['Tweet ID', 'Entity', 'Sentiment', 'Tweets']
# Twitter data
twitter_df = pd.read_csv("/content/sample_data/twitter_training.csv", names=cols)

print(twitter_df.head())
twitter_df.info()

   Tweet ID       Entity Sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                              Tweets  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   74682 non-null  int64 
 1   Entity     74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Tweets     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
twitter_df.isnull().sum()

Tweet ID       0
Entity         0
Sentiment      0
Tweets       686
dtype: int64

In [6]:
# Removing null records
twitter_df = twitter_df.dropna(axis=0)

In [16]:
twitter_df.info()
print(twitter_df['Entity'].value_counts())
twitter_df['Sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73996 entries, 0 to 74681
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   73996 non-null  int64 
 1   Entity     73996 non-null  object
 2   Sentiment  73996 non-null  object
 3   Tweets     73996 non-null  object
 4   Processed  73996 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB
MaddenNFL                            2377
LeagueOfLegends                      2377
CallOfDuty                           2376
Verizon                              2365
TomClancysRainbowSix                 2364
Facebook                             2362
Microsoft                            2361
Dota2                                2359
WorldOfCraft                         2357
ApexLegends                          2353
NBA2K                                2343
CallOfDutyBlackopsColdWar            2343
FIFA                                 2324
johnson&johnson        

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: Sentiment, dtype: int64

## Data Preprocessing


In [11]:
# Stemmer object
porter = PorterStemmer()
wnl = WordNetLemmatizer()

class DataPreprocess:
    
    def __init__(self):
        self.filters = [strip_tags,
                       strip_numeric,
                      #  strip_punctuation,
                       lambda x: x.lower(),
                        lambda x: re.sub(r'[^a-zA-Z#]', ' ', x),
                      #  lambda x: re.sub(r'\s+\w{1}\s+', '', x),
                       remove_stopwords]
    def __call__(self, doc):
        clean_words = self.__apply_filter(doc)
        return clean_words
    
    def __apply_filter(self, doc):
        try:
            cleanse_words = set(preprocess_string(doc, self.filters))
#             filtered_words = set(wnl.lemmatize(w) if w.endswith('e') or w.endswith('y') else porter.stem(w) for w in cleanse_words)
            return ' '.join(cleanse_words)
        except TypeError as te:
            raise(TypeError("Not a valid data {}".format(te)))

In [13]:
# Clean the text data and make the corpus cleansed
twitter_df['Processed'] = twitter_df['Tweets'].apply(DataPreprocess())

In [15]:
twitter_df['Processed'][0:10]

0                        getting im borderlands murder
1                                  borders coming kill
2                          getting im borderlands kill
3                         im coming borderlands murder
4                        getting im borderlands murder
5                        getting im borderlands murder
6    pc t favorite mlsiwfjg wallpaper enjoy making ...
7    pc t favorite mlsiwfjg wallpaper borderlands k...
8    fun fan hours t favorite huge spent know m cha...
9    rhandlerr pc t favorite mlsiwfjg wallpaper enj...
Name: Processed, dtype: object

## Feature Engineering

In [17]:
def vectorize(vector, X_train, X_test):
    vector_fit = vector.fit(X_train)
    
    X_train_vec = vector_fit.transform(X_train)
    X_test_vec = vector_fit.transform(X_test)
    
    print("Vectorization is completed.")
    return X_train_vec, X_test_vec

def label_encoding(y_train):
    """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
    """
    lbl_enc = LabelEncoder()
    
    y_train_enc = lbl_enc.fit_transform(y_train)
    labels = lbl_enc.classes_
    
    return y_train_enc, labels

In [19]:
X = twitter_df['Tweets']
y = twitter_df['Sentiment']

# Label encoding on the classes
y_enc_train, labels = label_encoding(y)

X_train, X_valid, y_train, y_valid = train_test_split(X, y_enc_train, test_size=0.2, shuffle=True)

In [20]:
print(X_train.shape, X_valid.shape)

(59196,) (14800,)


In [22]:

# BOW vectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
X_train_vec, X_valid_vec = vectorize(bow_vectorizer, X_train, X_valid)

Vectorization is completed.


In [25]:
def compute_classweights(target):
    """
    Computes the weights of the target values based on the samples
    :param target: Y-target variable
    :return: dictionary object
    """
    # compute class weights
    class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(target), y=target)
    
    # make the class weight list into dictionary
    weights = {}
    
    # enumerate the list
    for index, weight in enumerate(class_weights):
        weights[index] = weight
        
    return weights

# Get the class weights for the target variable
weights = compute_classweights(y_train)

In [28]:
# XGB classifier
xgb_params = {
    'max_depth': 7,
    'n_estimators': 1000,
    'lambda': 0.01,
    'class_weight': weights
}

xgb_clf = XGBClassifier(**xgb_params)

xgb_clf.fit(X_train_vec, y_train)

XGBClassifier(class_weight={0: 1.431237911025145, 1: 0.82856502995353,
                            2: 1.0221715706589307, 3: 0.8959859538657141},
              lambda=0.01, max_depth=7, n_estimators=1000,
              objective='multi:softprob')

In [29]:
# Prediction on the validation set
y_pred = xgb_clf.predict(X_valid_vec)


print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \tF1-Score: %1.3f\n" % (accuracy_score(y_valid, y_pred), precision_score(y_valid, y_pred, average='micro'), recall_score(y_valid, y_pred, average='micro'), f1_score(y_valid, y_pred, average='micro')))

Accuracy: 0.738 	Precision: 0.738 	Recall: 0.738 	F1-Score: 0.738

