### Libraries and Dataset

In [131]:
import numpy as np
import pandas as pd # data processing

import re
import string

import nltk #NLP
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import word_tokenize
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import random
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/xuan1113/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
# Read in dataset.
raw_data = pd.read_csv('New York sues Donald Trump, company and family members over widespread fraud claims, seeks at least $250 million in penalties.csv')
raw_data.head(5)

Unnamed: 0,comment
0,TLDR: \n\n+ Civil Suit\n+ Alleges Trump of fra...
1,">\tMar-a-Lago ""generated less than $25 million..."
2,I don’t quite understand how someone can be af...
3,The same people who stole money from cancer ch...
4,And this is how he'll be forced to admit he's ...


### Data Preprocessing

In [133]:
# Drop NAs
raw_data = raw_data.dropna()

In [134]:
# Define functions for cleaning.
def preprocess_text(text:str):
    """
    This utility function normalize a string by:
    - transforming in lowercase
    - expanding contractions
    - removing Usernames
    - removing links
    - removing hashtags
    - removing special/single characters
    - removing numbers 
    - removing multiple whitespaces
    - removing stopwords   
    - Stemming : suffix stripping
    - Lemmatization : get the root word
    
    
    Args:
        text (str): the input text you want to clean
        remove_stopwords (bool): whether or not to remove stopwords
    Returns:
        str: the cleaned text
    """

    # Convert all characters to lowercase
    text = text.lower()
    # Replacing contractions
    text = contractions.fix(text)
    # Remove username after @
    text = re.sub(r'@[^\s]+','',text)
    # Remove URLs
    text = re.sub(r"http\S+", '', text)
    # Remove hashtags
    text = re.sub(r'#[^\s]+', '', text)
    # Replace all the special characters and numbers with space
    text = re.sub(r"[^A-Za-z]+", ' ',text)
    # Remove all single characters
    text = re.sub(r'\s[a-zA-Z]\s', ' ',text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # Stemming / Lemmatization / Remove stop words
    # Tokenize : breaking down sentence into word token.
    tokens = word_tokenize(text)
    # Check for stop words
    tokens = [w for w in tokens if w not in stopwords.words("english")]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(w) for w in tokens]
    # Lemmatization
    wnetl = WordNetLemmatizer()
    tokens = [wnetl.lemmatize(w) for w in tokens]
    # Join cleaned words back together.
    text = " ".join(tokens)

    return text


In [135]:
def add_sentiment(text:str):
    """
    Function extract sentiment score from a text, value range from -1 to 1,
    where 1 is the most positive value, -1 otherwise.

    Args:
        text(str): input text to parse.
    
    Returns:
        sentiment(str): Positive or Negative.
    """
    sia = SIA()
    if sia.polarity_scores(text)['compound'] > 0 :
        return 'POSITIVE'
    elif sia.polarity_scores(text)['compound'] == 0:
        return 'NEGATIVE'
    else:
        return 'NEUTRAL'


In [136]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """
    This class is used as a pipeline and is inheriting from sklearn BaseEstimator and TransformerMixin
    This allows us to customize transforms and the final estimator that sklearn does not offer.
    We need to build this class to clean our train and test data as a pipeline transform.
    """
    def __init__(self):
        return None

    def fit(self, X=None):
        return self

    def transform(self, X=None):
        cleaned = X.apply(lambda x:preprocess_text(x))

        return cleaned


In [137]:
class SentimentAnalysis(BaseEstimator, TransformerMixin):
    """
    Class used to generate sentiment.
    """
    def __init__(self):
        return None

    def fit(self, X=None):
        return self
        
    def transform(self, X=None):
        sentiment = X.apply(lambda x: add_sentiment(x))
        return {'text':X,'sentiment':sentiment}

In [138]:
# Pipeline implementation
preprocess_pipeline = Pipeline(steps=[
    ('preprocess', Preprocessor()), # first step is to preprocess the text 
    ('sentiment', SentimentAnalysis()) # second step obtains the sentiment of the text
])

final = pd.DataFrame(preprocess_pipeline.fit_transform(X=raw_data['comment']))
final.head()

Unnamed: 0,text,sentiment
0,tldr civil suit alleg trump fraud span year in...,NEUTRAL
1,mar lago gener le million annual revenu suit s...,NEGATIVE
2,quit understand someon affili mani convict peo...,NEUTRAL
3,peopl stole money cancer chariti would never c...,NEUTRAL
4,forc admit broke,NEUTRAL


### Feature Engineering

In [139]:
# Explore the data frame.
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       85 non-null     object
 1   sentiment  85 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [140]:
final['sentiment'].value_counts()

NEUTRAL     33
POSITIVE    29
NEGATIVE    23
Name: sentiment, dtype: int64

+ Now we have 2 columns in the dataframe - text and sentiment. Sentiment is the target column to predic. We only need to predict positive and negative sentiment, so neutral sentiment will be removed from the dataframe. The dataset is unbalanced as it contains more number of positive text compare to negative ones, we need to convert adjust them to even amount to make sure our model won't bias.

In [141]:
# Removing Neutral sentiments.
final = final[final.sentiment != 'NEUTRAL']

# Randomly drop 6 positive sentiments to have even number of positive and negative data.
final.drop(final[final['sentiment']=='POSITIVE'].sample(n=6).index, inplace=True)

final['sentiment'].value_counts()


NEGATIVE    23
POSITIVE    23
Name: sentiment, dtype: int64

In [142]:
# Convert labels to numeric forms.
final['label'] = final['sentiment'].replace({'POSITIVE':1,'NEGATIVE':0})
final.head()

Unnamed: 0,text,sentiment,label
1,mar lago gener le million annual revenu suit s...,NEGATIVE,0
5,bank loan money without due dilig scrutini bla...,POSITIVE,1
9,million,NEGATIVE,0
10,one adult kid go turn donald trump first money...,NEGATIVE,0
13,great businessman need illeg pac illeg chariti...,POSITIVE,1


### Train, test, split