In [14]:
import gensim.models
import gensim.corpora
import gensim as gs
import pandas as pd
from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser
from joblib import dump, load
import numpy as np
import nltk
from nltk.stem.porter import *
import numpy as np
import spacy
# Load the large English spaCy pipeline once; extract_features() runs every message through it.
nlp = spacy.load("en_core_web_lg")
# Snowball stemmer for English (note: this is not the Porter stemmer that is star-imported above).
stemmer = nltk.stem.SnowballStemmer('english')

In [64]:
def split_punct(text):
    """Pad every '.', ',', '!' and '?' in ``text`` with one space on each side.

    Returns the padded copy of ``text``; all other characters are left
    untouched, so existing whitespace around a mark is kept as well.
    """
    padding_table = str.maketrans({mark: " " + mark + " " for mark in ".,!?"})
    return text.translate(padding_table)
    
def _lexicon_entries(lexicon):
    """Return an object on which ``stem in ...`` tests the lexicon's words.

    ``in`` on a pandas DataFrame tests the *column labels* and ``in`` on a
    Series tests the *index*, so for those two types the string cells are
    collected into a set instead. Any other container is returned unchanged.
    """
    if isinstance(lexicon, pd.DataFrame):
        cells = lexicon.values.ravel()
    elif isinstance(lexicon, pd.Series):
        cells = lexicon.values
    else:
        return lexicon
    # Only string cells can equal a stem; this also skips NaN and numeric columns.
    # NOTE(review): a header-less CSV read with the default header=0 loses its
    # first word to the column label -- confirm the lexicon files have a header.
    return {cell for cell in cells if isinstance(cell, str)}


def get_emotion_words(stems, list_of_lexica):
    """Count, per lexicon, how many of ``stems`` occur in that lexicon.

    Parameters
    ----------
    stems : iterable of str
        Stemmed tokens of one message; duplicates are counted each time.
    list_of_lexica : sequence
        One lexicon per emotion. A lexicon may be a pandas DataFrame/Series
        (its string cell values are used) or any container supporting ``in``.

    Returns
    -------
    numpy.ndarray
        Float counts, ``result[i]`` belonging to ``list_of_lexica[i]``. The
        array has at least four slots (unused ones stay 0.0) because callers
        index positions 0-3; it grows when more than four lexica are given.
    """
    stems = list(stems)  # allow one-shot iterables; stems are scanned once per lexicon
    lexica = list(list_of_lexica)
    emotion_words = np.zeros(max(4, len(lexica)))
    for index, lexicon in enumerate(lexica):
        entries = _lexicon_entries(lexicon)
        emotion_words[index] = sum(1 for stem in stems if stem in entries)
    return emotion_words

def get_cons_punct_count(pos):
    """Count adjacent pairs in ``pos`` where both entries equal 97.

    ``pos`` is a sequence of part-of-speech ids; 97 is presumably spaCy's
    PUNCT id (TODO confirm against the installed spaCy version). A run of k
    consecutive 97s contributes k - 1 to the result.
    """
    neighbours = zip(pos, pos[1:])
    return sum(1 for current, following in neighbours if current == 97 and current == following)

def extract_features(list_of_lexica, input_message, seq_len):
    """Turn one raw message into a feature vector plus its POS ids and stems.

    The message is punctuation-padded, parsed, stripped of stop words and of
    tokens whose POS id is 103 (presumably spaCy's SPACE id -- TODO confirm),
    and the surviving token texts are re-joined and parsed a second time.

    Returns a tuple ``(feature_vec, pos, stems)``:
      * ``feature_vec`` -- eight values: token count / ``seq_len``, share of
        all-uppercase tokens, entities per token, consecutive-punctuation
        count, then the four emotion-word counts each divided by token count.
      * ``pos``   -- POS id of every token of the second parse.
      * ``stems`` -- Snowball stems of the tokens whose POS id is not 97
        (presumably PUNCT).
    Three empty lists are returned when nothing survives the filtering.
    """
    first_pass = nlp(split_punct(input_message))
    kept_texts = [tok.text for tok in first_pass if not (tok.is_stop or tok.pos == 103)]
    doc = nlp(" ".join(kept_texts))

    n_tokens = len(doc)
    if n_tokens == 0:
        return [], [], []

    pos = [tok.pos for tok in doc]
    stems = [stemmer.stem(tok.text) for tok in doc if tok.pos != 97]
    emotion_words = get_emotion_words(stems, list_of_lexica)

    feature_vec = [
        n_tokens / seq_len,
        sum(tok.text.isupper() for tok in doc) / n_tokens,
        len(doc.ents) / n_tokens,
        get_cons_punct_count(pos),
    ]
    # One normalised count per emotion lexicon (exactly four are consumed).
    feature_vec.extend(emotion_words[slot] / n_tokens for slot in range(4))
    return feature_vec, pos, stems

In [104]:
# File stems of the four emotion lexica; each is read from ../../lexica/<name>.csv.
lexica_names = ["clean_happiness", "clean_sadness", "clean_anger", "clean_fear"]
# One object per lexicon as returned by pd.read_csv (a DataFrame by default), in lexica_names order.
list_of_lexica = [pd.read_csv("../../lexica/" + dataset_name + ".csv") for dataset_name in lexica_names]
# Per-dataset value handed to extract_features() as `seq_len`, where it divides the token count.
seq_len = {"norm_tweet": 81, "norm_emotion": 32}

# Names of the saved models. load_classifier() dispatches on the substrings
# "logistic_regression", "random_forests" and "net" found in these names.
classifier = ["norm_emotion_full_logistic_regression",
              "norm_emotion_lex_logistic_regression",
              "norm_emotion_vec-unigram_logistic_regression",
              "norm_emotion_vec-bigram_logistic_regression",
              "norm_emotion_topics_logistic_regression",
              "norm_tweet_full_logistic_regression",
              "norm_tweet_lex_logistic_regression",
              "norm_tweet_vec-unigram_logistic_regression",
              "norm_tweet_vec-bigram_logistic_regression",
              "norm_tweet_topics_logistic_regression",
              "norm_emotion_full_random_forests",
              "norm_emotion_lex_random_forests",
              "norm_emotion_vec-unigram_random_forests",
              "norm_emotion_vec-bigram_random_forests",
              "norm_emotion_topics_random_forests",
              "norm_tweet_full_random_forests",
              "norm_tweet_lex_random_forests",
              "norm_tweet_vec-unigram_random_forests",
              "norm_tweet_vec-bigram_random_forests",
              "norm_tweet_topics_random_forests",
              "net_lin_emotion(full)",
              "net_lin_emotion(lex)",
              "net_lin_emotion(vec-unigram)",
              "net_lin_emotion(vec-bigram)",
              "net_lin_emotion(topics)",
              "net_lin_tweet(full)",
              "net_lin_tweet(lex)",
              "net_lin_tweet(vec-unigram)",
              "net_lin_tweet(vec-bigram)",
              "net_lin_tweet(topics)"]

# Selects "norm_emotion_lex_logistic_regression". This module-level name is not read
# by the cells visible here (load_classifier's `c_name` parameter is a separate local).
c_name = classifier[1]



              
# NOTE(review): this rebinds `classifier`, discarding the list of saved-model names
# assigned earlier in this cell. None of the three lists below is referenced by the
# cells visible here.
classifier = ["lr", "rf", "net"]
datasets = ["norm_emotion", "norm_tweet"]
features_sets = ["full", "lex", "vec-unigram", "vec-bigram", "topics"]

def load_classifier(c_name):
    """Load the saved model called ``c_name``, print it and return it.

    The model family is chosen by substring, checked in this order:
      * ``"logistic_regression"`` -> ``../models/logistic_regression/<c_name>.joblib``
      * ``"random_forests"``      -> ``../models/random_forests/<c_name>.joblib``
      * ``"net"``                 -> a ``Net_Lin_Emotion_All`` instance with the
        weights from ``../nets/net_lin_emotion_all.pt``

    Raises
    ------
    ValueError
        If ``c_name`` matches none of the families above (previously this
        surfaced as an UnboundLocalError on the ``print``).
    """
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted source.
    if "logistic_regression" in c_name:
        c = load("../models/logistic_regression/" + c_name + ".joblib")
    elif "random_forests" in c_name:
        c = load("../models/random_forests/" + c_name + ".joblib")
    elif "net" in c_name:
        # NOTE(review): every "net" name loads the same architecture and weight
        # file, whatever dataset/feature set the name mentions. Neither
        # Net_Lin_Emotion_All nor torch is defined in the cells visible here.
        c = Net_Lin_Emotion_All()
        c.load_state_dict(torch.load("../nets/net_lin_emotion_all.pt"))
    else:
        raise ValueError(
            "unknown classifier name %r: expected it to contain "
            "'logistic_regression', 'random_forests' or 'net'" % (c_name,)
        )
    print(c)
    return c
        
def classify_input(c, input_message, dataset_name=None):
    """Extract features from ``input_message`` and print ``c``'s prediction.

    Parameters
    ----------
    c : object
        A fitted classifier exposing ``predict`` and ``predict_proba``.
    input_message : str
        Raw text to classify.
    dataset_name : str, optional
        Key into the module-level ``seq_len`` dict and should match the
        dataset the model was trained on. When omitted, the
        notebook-global ``dataset_name`` is used for backward compatibility.

    Prints the feature vector, POS ids and stems, then the predicted class
    and the class probabilities. Returns ``None``.

    Raises
    ------
    NameError
        If ``dataset_name`` is neither passed nor defined as a global.
    """
    if dataset_name is None:
        # The original body read a global that no visible cell defines, so it
        # only worked with leftover kernel state; keep that as a fallback.
        try:
            dataset_name = globals()["dataset_name"]
        except KeyError:
            raise NameError(
                "dataset_name is not defined: pass dataset_name=... "
                "(one of %s)" % sorted(seq_len)
            ) from None
    fvec = extract_features(list_of_lexica, input_message, seq_len[dataset_name])
    for a in fvec: print(a)
    if not fvec[0]:
        # extract_features returns empty lists when no token survives the
        # filtering; an empty row would make the classifier raise.
        print("no features extracted from the message; nothing to classify")
        return
    print("prediction: ", c.predict([fvec[0]]))
    print("predict_proba", c.predict_proba([fvec[0]]))

In [106]:
# One saved-model name per classifier family, for quick manual testing.
lr_test = "norm_emotion_full_logistic_regression"
rf_test = "norm_emotion_full_random_forests"
net_test = "net_lin_emotion(full)"

# Load the random-forest model; load_classifier() also prints it (see the output below).
c = load_classifier(rf_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=21,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [107]:
# Sample message to run through the loaded classifier.
input_message = "Hello, this is absurd my house is not a glitter awful"
# Prints the feature vector, POS ids and stems, then the predicted class and probabilities.
# NOTE(review): classify_input reads a global `dataset_name` that no visible cell defines;
# the recorded first feature (0.074... = 6/81) suggests a stale "norm_tweet" value was used
# with a norm_emotion model -- confirm on a fresh kernel.
classify_input(c, input_message)

[0.07407407407407407, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 0.0]
[91, 97, 84, 92, 92, 84]
['hello', 'absurd', 'hous', 'glitter', 'aw']
prediction:  [0]
predict_proba [[0.27901031 0.27653536 0.27236148 0.17209284]]
