In [5]:
%pwd
%cd ..

/home/yukikongju/Projects/tidytuesday/financials_news_sentimentanalysis


In [49]:
import pandas as pd

from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Get the data

In [33]:
df = pd.read_csv('data/data.csv')
# df = df[:500]

In [8]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


### Build Preprocessing Pipeline

In [45]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(Normalizer, self).__init__()
        self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
    
    def fit(self, sentences, labels=None):
        return self
    
    def transform(self, sentences):
        return [self.normalize(sentence) for sentence in sentences]
    
    def normalize(self, sentence):
        pass


class StopWordLemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(StopWordLemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            if word not in self.stop_words: 
                words.append(self.lemmatizer.lemmatize(word))
        return ' '.join(words)
    
class LemmerNormalizer(Normalizer):
    
    def __init__(self):
        super(LemmerNormalizer, self).__init__()
        
    def normalize(self, sentence):
        words = []
        for word in sentence.split(' '):
            words.append(self.lemmatizer.lemmatize(word))
        return ' '.join(words)

In [46]:
class OneHotVectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super(OneHotVectorizer, self).__init__()
        self.vectorizer = CountVectorizer(binary=True)
    
    def fit(self, sentences, labels = None):
        return self
    
    def transform(self, sentences):
        freqs = self.vectorizer.fit_transform(sentences)
        return freqs.toarray()


class TFIDFVectorizer(BaseEstimator, TransformerMixin):
    pass


In [None]:
class NGramsVectorizer(BaseEstimator, TransformerMixin): # TODO
    
    def __init__(self, n_grams=2):
        super(NGramsVectorizer, self).__init__()
        self.n_grams = n_grams
    
    def fit(self, sentences, labels=None):
        pass
    
    def transform(self, sentences):
        pass

In [None]:
class Word2VectVectorizer(BaseEstimator, TransformerMixin): # TODO
    
    def __init__(self, n_dim=1000):
        super(Word2VectVectorizer, self).__init__()
        self.n_dim = n_dim
        self.model = None
    
    def fit(self, sentences, labels=None):
        self.model = word2vec.Word2Vec(corpus, vector_size=self.n_dim, window=5, min_count=1)
        return self
    
    def transform(self, sentences):
        pass

In [47]:
pipeline = Pipeline([
    ('normalizer', StopWordLemmerNormalizer()),
    ('vectorizer', OneHotVectorizer())
])

texts = [
    "The S&P500 is up 50 points today",
    "Firm to be buyout in the next quarter"
]
print(pipeline.fit_transform(texts))

[[1 0 0 0 1 1 0 1 1]
 [0 1 1 1 0 0 1 0 0]]


### Split data into training and testing

In [34]:
sentiment_dict = {'positive': 2, 'negative': 0, 'neutral': 1}
y = df['Sentiment'].apply(lambda x: sentiment_dict.get(str(x))).tolist()
X = pipeline.fit_transform(df['Sentence'].tolist()) # FIXME: should I split first, then vectorize?

In [35]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Train Models with TextNorm + OneHotVect pipeline

In [41]:
def get_model_performance(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(classification_report(y_pred, y_test))

In [40]:
get_model_performance(XGBClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.17      0.34      0.22       119
           1       0.87      0.68      0.77      1224
           2       0.58      0.77      0.66       410

    accuracy                           0.68      1753
   macro avg       0.54      0.60      0.55      1753
weighted avg       0.76      0.68      0.71      1753



In [39]:
get_model_performance(GaussianNB(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.38      0.22      0.27       423
           1       0.54      0.70      0.61       752
           2       0.51      0.49      0.50       578

    accuracy                           0.51      1753
   macro avg       0.48      0.47      0.46      1753
weighted avg       0.49      0.51      0.49      1753



In [43]:
get_model_performance(DecisionTreeClassifier(), x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.19      0.17      0.18       272
           1       0.69      0.68      0.69       976
           2       0.60      0.66      0.63       505

    accuracy                           0.59      1753
   macro avg       0.50      0.50      0.50      1753
weighted avg       0.59      0.59      0.59      1753

