In [1]:
%pwd
%cd ..

/home/yukikongju/Projects/tidytuesday/financials_news_sentimentanalysis


In [58]:
import pandas as pd
import nltk 

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

### Get the data

In [7]:
df = pd.read_csv('data/data.csv')
df = df[:400]

In [8]:
sentiment_dict = {'positive': 2, 'negative': 0, 'neutral': 1}
y = df['Sentiment'].apply(lambda x: sentiment_dict.get(str(x))).tolist()
X = df['Sentence'].tolist() # FIXME: should we split first, then vectorize?

### Build Preprocessing Pipeline

In [77]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(Normalizer, self).__init__()
        self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences, labels=None):
        return [self.normalize(sentence) for sentence in sentences]
    
    def normalize(self, sentence):
        pass

class LemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(LemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            words.append(self.lemmatizer.lemmatize(word))
        return ' '.join(words)
    

In [78]:
# class NGramsVectorizer(BaseEstimator, TransformerMixin):  
class NGramsVectorizer(BaseEstimator):    
    
    
    def __init__(self, n_grams=2):
        super(NGramsVectorizer, self).__init__()
        self.n_grams = n_grams
        self.vocab = None
        self.word_dict = None
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences, labels):
        # build vocab
        self._build_vocab(sentences)
        
        # build word dict
        self._build_word_dict()
        
        # get ngram 
        ngrams = []
        for sentence, label in zip(sentences, labels):
            sentence_grams = self._get_ngrams_from_sentence(sentence)
            for gram in sentence_grams:
                gram_idx = [self.word_dict.get(word) for word in gram]
#                 ngrams.append((gram, label))
                ngrams.append((gram_idx, label))
                
        return ngrams
    
    def _get_ngrams_from_sentence(self, sentence):
        """ 
        >>> sentence = "Report suggests that company should go bankrupt by the end of the quarter"
        >>> [['Report', 'suggests'], ['suggests', 'that'], ..,  ['the', 'quarter'] ]
        """
        words = sentence.split(' ')
        ngrams = [words[i:i+n] for i in range(0, len(words)-1)]
        return ngrams
        
    
    def _build_vocab(self, sentences):
        self.vocab = set([word for sentence in sentences for word in sentence.split(' ')])
        
    
    def _build_word_dict(self):
        self.word_dict = { word: i for i, word in enumerate(self.vocab)}

In [68]:
text, n = "Report suggests that company should go bankrupt by the end of the quarter", 2
text = text.split(' ')
ngrams = [text[i:i+n] for i in range(0, len(text)-1)]

In [81]:
vectorizer = NGramsVectorizer()
ngrams_vect = vectorizer.transform(X, y)

In [82]:
# TODO: pipeline

### Split data into Training and testing

In [94]:
ngrams_x, ngrams_y = [], []
for ngram_x, ngram_y in ngrams_vect:
    ngrams_x.append(ngram_x)
    ngrams_y.append(ngram_y)


x_train, x_test, y_train, y_test = train_test_split(ngrams_x, ngrams_y, test_size = 0.3)

### Train Models

In [95]:
def get_model_performance(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(classification_report(y_pred, y_test))

In [101]:
get_model_performance(XGBClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.02      0.15      0.04        48
           1       0.88      0.61      0.72      2051
           2       0.14      0.33      0.19       265

    accuracy                           0.57      2364
   macro avg       0.35      0.36      0.32      2364
weighted avg       0.78      0.57      0.65      2364



In [96]:
get_model_performance(GaussianNB(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.60      0.75      2364
           2       0.00      0.00      0.00         0

    accuracy                           0.60      2364
   macro avg       0.33      0.20      0.25      2364
weighted avg       1.00      0.60      0.75      2364



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
get_model_performance(DecisionTreeClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.16      0.14      0.15       367
           1       0.60      0.62      0.61      1374
           2       0.28      0.29      0.28       623

    accuracy                           0.46      2364
   macro avg       0.35      0.35      0.35      2364
weighted avg       0.45      0.46      0.45      2364

