In [2]:
%pwd
%cd ..

/home/yukikongju/Projects/tidytuesday/financials_news_sentimentanalysis


In [76]:
import pandas as pd
import string
import numpy as np
# import imblearn

from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
# from imblearn.over_sample import SMOTE
from sklearn.metrics import classification_report

### Get the Data

In [6]:
df = pd.read_csv('data/data.csv')
# df = df[:500]

In [7]:
df.head(2)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative


### Build Preprocessing Pipeline

In [22]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(Normalizer, self).__init__()
        self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences):
        return [self.normalize(sentence) for sentence in sentences]
    
    def normalize(self, sentence):
        pass


class StopWordLemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(StopWordLemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            if word not in self.stop_words and word not in string.punctuation and 'http' not in word:
                words.append(self.lemmatizer.lemmatize(word.lower()))
#         return ' '.join(words)
        return words
    
class LemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(LemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            words.append(self.lemmatizer.lemmatize(word.lower()))
        return ' '.join(words)

In [23]:
class OneHotVectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(OneHotVectorizer, self).__init__()
        self.vectorizer = CountVectorizer(binary=True)
    
    def fit(self, sentences, labels = None):
        return self
    
    def transform(self, sentences):
        freqs = self.vectorizer.fit_transform(sentences)
        return freqs.toarray()


class TFIDFVectorizer(BaseEstimator, TransformerMixin):
    pass


In [52]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, vector_size, window):
        self.vector_size = vector_size
        self.window = window
        
    def fit(self, X, y=None):
        self.model = Word2Vec(sentences=X, vector_size = self.vector_size, 
            window = self.window)
        return self
    
    def transform(self, X, y=None):
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        valid_words = [word for word in words if word in self.model.wv.key_to_index]
        if valid_words:
            embedding = np.zeros((len(valid_words), self.vector_size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.vector_size)

### Get Baseline Classifier Performance with Word2Vec Vectorizer

In [67]:
X = df['Sentence']

emotions_map = {'positive': 2, 'negative': 0, 'neutral': 1}
y = df['Sentiment'].apply(lambda x: emotions_map.get(x)) 

In [68]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [71]:
def get_classifier_performance(model, x_train, x_test, y_train, y_test):
    pipeline = Pipeline([
        ('normalizer', StopWordLemmerNormalizer()),
        ('embedding', Word2VecVectorizer(100, 5)),
        ('model', model)
    ])
    pipeline.fit(x_train,y_train)
    y_pred = pipeline.predict(x_test)
    print(classification_report(y_pred, y_test))

In [72]:
get_classifier_performance(XGBClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.06      0.07      0.06       199
           1       0.72      0.57      0.64      1204
           2       0.32      0.53      0.40       350

    accuracy                           0.51      1753
   macro avg       0.37      0.39      0.37      1753
weighted avg       0.57      0.51      0.53      1753



In [73]:
get_classifier_performance(DecisionTreeClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.15      0.09      0.11       373
           1       0.50      0.55      0.52       863
           2       0.37      0.41      0.39       517

    accuracy                           0.41      1753
   macro avg       0.34      0.35      0.34      1753
weighted avg       0.38      0.41      0.39      1753



In [74]:
get_classifier_performance(SGDClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.95      0.57      0.71      1598
           2       0.12      0.44      0.19       155

    accuracy                           0.55      1753
   macro avg       0.36      0.33      0.30      1753
weighted avg       0.88      0.55      0.66      1753



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
get_classifier_performance(GaussianNB(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.38      0.21      0.27       406
           1       0.70      0.59      0.64      1126
           2       0.09      0.24      0.13       221

    accuracy                           0.46      1753
   macro avg       0.39      0.35      0.35      1753
weighted avg       0.55      0.46      0.49      1753



In [77]:
get_classifier_performance(KNeighborsClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.18      0.17      0.17       227
           1       0.74      0.58      0.65      1206
           2       0.24      0.43      0.31       320

    accuracy                           0.50      1753
   macro avg       0.38      0.40      0.38      1753
weighted avg       0.57      0.50      0.53      1753

