In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Get the Data

In [2]:
df = pd.read_csv('../data/data.csv')
df.head(2)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative


In [28]:
len(df)

5842

In [59]:
print(df['Sentiment'].value_counts())

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64


### Vectorize Sentence with Topic Modelling

In [51]:
class PCATopicVectorizer(BaseEstimator, TransformerMixin):

    """ 
    Transform co-occurence matrix / TF-IDF into topic vectors 

    Examples
    --------
    >>> documents = ["I am writing", "The word is short"]
    >>> topic_vectorizer = PCATopicVectorizer(TfidfVectorizer(), n_topics = 6)
    >>> df = topic_vectorizer.fit_transform(documents)
    >>>             topic0  topic1  topic2  topic3  topic4  topic5
    ...     rather  0.0044  0.0951 -0.1192  0.2258 -0.1503 -0.0067
    >>> df size: num words x num topics
    """

    def __init__(self, vectorizer, n_topics):
        self.vectorizer = vectorizer
        self.n_topics = n_topics
        self.pca = PCA(n_components=n_topics)

    def fit(self, X, y=None):
        return self
        
    def transform(self, documents):
        # get co-occurence matrix with vectorizer
        documents_vectorized = self.vectorizer.fit_transform(documents)
        occ_mat = documents_vectorized.toarray()
        word_dict = self.vectorizer.vocabulary_
        vocab = sorted(self.vectorizer.vocabulary_.keys())

        # get topic vetor with pca (dimension reduction)
        pca_topic_vectors = self.pca.fit_transform(occ_mat)
#         weights = pd.DataFrame(self.pca.components_.round(4), columns=word_dict, 
#                 index=[f'topic{i}' for i in range(self.n_topics)])
#         return weights.T
        return pd.DataFrame(pca_topic_vectors)
        
class TruncatedSVDTopicVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, vectorizer, n_topics):
        self.vectorizer = vectorizer
        self.n_topics = n_topics
        self.svd = TruncatedSVD(n_components=n_topics)

    def fit(self, X):
        return self


    def transform(self, documents):
        # get co-occurence matrix with vectorizer
        documents_vectorized = self.vectorizer.fit_transform(documents)
        occ_mat = documents_vectorized.toarray()
        word_dict = self.vectorizer.vocabulary_
        vocab = sorted(self.vectorizer.vocabulary_.keys())

        # get topic vetor with pca (dimension reduction)
        topic_vectors = self.svd.fit_transform(occ_mat)
#         weights = pd.DataFrame(self.svd.components_.round(4), columns=word_dict, 
#                 index=[f'topic{i}' for i in range(self.n_topics)])
#         return weights.T
        return pd.DataFrame(topic_vectors)


In [12]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(Normalizer, self).__init__()
#         self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences):
        return [self.normalize(sentence) for sentence in sentences]
    
    def normalize(self, sentence):
        pass
    
class LemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(LemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            words.append(self.lemmatizer.lemmatize(word))
        return ' '.join(words)

In [33]:
vectorizer = PCATopicVectorizer(TfidfVectorizer(), 6)
vectorizer.fit_transform(df['Sentence'])

Unnamed: 0,0,1,2,3,4,5
0,-0.080547,-0.047560,-0.002001,-0.023299,-0.036874,-0.017216
1,-0.044806,-0.084806,0.052420,0.016438,0.024691,0.010859
2,0.088171,0.146225,0.075232,0.089299,-0.026032,0.075103
3,0.002422,0.086496,-0.103460,-0.057106,-0.022553,0.010245
4,-0.058007,0.031016,-0.042545,0.002675,-0.040885,-0.022565
...,...,...,...,...,...,...
5837,-0.068024,-0.056550,0.030989,0.007151,0.003565,0.000157
5838,-0.053824,-0.054168,0.022841,0.013444,-0.026409,0.018824
5839,-0.079462,-0.006293,-0.044158,-0.034532,0.021192,-0.028905
5840,0.196840,-0.056173,-0.043662,0.009098,-0.146317,0.023787


In [46]:
pipeline = Pipeline([
    ('normalizer', LemmerNormalizer()),
    ('vectorizer', PCATopicVectorizer(TfidfVectorizer(), 8))
])

### Split Training and Testing Set

In [47]:
sentiment_dict = {'positive': 2, 'negative': 0, 'neutral': 1}
y = df['Sentiment'].apply(lambda x: sentiment_dict.get(str(x))).tolist()
X = pipeline.fit_transform(df['Sentence']) # FIXME: should I split first, then vectorize?

In [48]:
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.082162,-0.049403,-0.006066,-0.038137,-0.033010,-0.005078,-0.003851,-0.047816
1,-0.045003,-0.088103,0.051362,0.019791,0.021702,0.008366,-0.005406,0.047180
2,0.100500,0.149550,0.085529,0.089502,-0.038040,0.041604,0.036000,0.164709
3,0.003943,0.113428,-0.117405,-0.059986,-0.011854,0.005935,0.033674,-0.009274
4,-0.059110,0.031338,-0.036604,-0.008433,-0.061202,0.003634,0.039707,-0.011427
...,...,...,...,...,...,...,...,...
5837,-0.070881,-0.058412,0.028664,0.006910,-0.011128,-0.018385,0.043639,0.000239
5838,-0.050023,-0.055133,0.022763,0.008235,-0.016335,0.021596,-0.057037,-0.007175
5839,-0.080865,-0.009503,-0.046569,-0.030033,0.016156,-0.034531,-0.003830,0.054906
5840,0.198316,-0.054323,-0.049095,-0.031664,-0.112926,0.078606,-0.117437,-0.163252


In [49]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [82]:
print(len(x_test), len(y_test))

1753 1753


### Resample with SMOTE

In [66]:
print(pd.DataFrame(y_train).value_counts())

1    2197
2    1275
0     617
dtype: int64


In [74]:
sm = SMOTE(random_state=42)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

In [75]:
print(pd.DataFrame(y_train_res).value_counts())

0    2197
1    2197
2    2197
dtype: int64


### Training the model

In [50]:
def get_model_performance(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(classification_report(y_pred, y_test))

In [76]:
get_model_performance(XGBClassifier(), x_train_res, x_test, y_train_res, y_test)

              precision    recall  f1-score   support

           0       0.21      0.14      0.17       360
           1       0.67      0.65      0.66       954
           2       0.43      0.56      0.49       439

    accuracy                           0.52      1753
   macro avg       0.43      0.45      0.44      1753
weighted avg       0.51      0.52      0.52      1753



In [77]:
get_model_performance(SGDClassifier(), x_train_res, x_test, y_train_res, y_test)

              precision    recall  f1-score   support

           0       0.59      0.26      0.36       554
           1       0.84      0.66      0.74      1179
           2       0.02      0.50      0.03        20

    accuracy                           0.53      1753
   macro avg       0.48      0.47      0.38      1753
weighted avg       0.75      0.53      0.61      1753



In [78]:
get_model_performance(GaussianNB(), x_train_res, x_test, y_train_res, y_test)

              precision    recall  f1-score   support

           0       0.30      0.23      0.26       323
           1       0.64      0.71      0.67       833
           2       0.47      0.46      0.47       597

    accuracy                           0.54      1753
   macro avg       0.47      0.47      0.47      1753
weighted avg       0.52      0.54      0.53      1753



In [79]:
get_model_performance(LogisticRegression(), x_train_res, x_test, y_train_res, y_test)

              precision    recall  f1-score   support

           0       0.47      0.28      0.35       418
           1       0.66      0.72      0.69       855
           2       0.38      0.45      0.41       480

    accuracy                           0.54      1753
   macro avg       0.50      0.48      0.48      1753
weighted avg       0.54      0.54      0.53      1753



In [80]:
get_model_performance(DecisionTreeClassifier(), x_train_res, x_test, y_train_res, y_test)

              precision    recall  f1-score   support

           0       0.21      0.11      0.15       441
           1       0.52      0.58      0.55       834
           2       0.39      0.47      0.43       478

    accuracy                           0.43      1753
   macro avg       0.37      0.39      0.37      1753
weighted avg       0.41      0.43      0.42      1753

