In [1]:
import pandas as pd
import spacy
import os
import nltk
from nltk.stem.porter import *
import numpy as np

In [2]:
def log(summary):
    """Write a status message to standard output.

    Args:
        summary: The object to print; it is passed to ``print`` unchanged.
    """
    print(summary)
    
def make_data(dataset, lexica=False):
    """Load the named CSV files as one data frame or as a list of stem lists.

    Args:
        dataset: Iterable of file names without the ``.csv`` extension.
        lexica: When false (the default) every name is read from
            ``../raw/`` and the frames are concatenated row-wise with a
            fresh 0..n-1 index. When true every name is read from
            ``../../lexica/`` and only its ``stems`` column is kept.

    Returns:
        A single concatenated ``DataFrame`` when ``lexica`` is false,
        otherwise a list holding one list of ``stems`` values per file,
        in the order the names were given.
    """
    if lexica:
        return [
            pd.read_csv("../../lexica/" + name + ".csv")["stems"].tolist()
            for name in dataset
        ]

    frames = [pd.read_csv("../raw/" + name + ".csv") for name in dataset]
    return pd.concat(frames, axis=0, ignore_index=True)
    
def extract_features(dataset, list_of_lexica, save_name, print_row):
    """Compute per-text features and write them to three CSV files.

    For every row, ``row["text"]`` is passed through ``split_punct``, parsed
    with the module-level ``nlp`` pipeline, stripped of stop words and
    whitespace tokens, re-joined and parsed a second time. Rows whose second
    parse is empty are skipped; all other rows contribute one record.

    Args:
        dataset: DataFrame with ``text`` and ``affect`` columns. The row
            index is used with ``%`` for progress output, so it is assumed
            to be an integer index -- TODO confirm for other callers.
        list_of_lexica: Forwarded to ``get_emotion_words``; the first four
            counts it returns are stored per row.
        save_name: Prefix of the output files written below ``../cleaned/``.
        print_row: A progress line is logged whenever the row index is a
            multiple of this value.

    Output files (all under ``../cleaned/``):
        ``<save_name>_clean.csv`` with the columns
            t   -- cleaned text (tokens joined by single spaces)
            a   -- ``row["affect"]``
            wc  -- token count of the cleaned text
            uwc -- share of tokens whose text ``isupper()``
            ewc -- ``len(doc.ents)`` divided by the token count
            cpc -- result of ``get_cons_punct_count`` on the pos ids
            hc, sc, fc, ac -- emotion counts 0..3 from ``get_emotion_words``
        ``<save_name>_pos.csv``: pos ids per row, right-padded with 999.
        ``<save_name>_stems.csv``: stems of non-punctuation tokens per row,
            right-padded with "__".
    """
    # Numeric pos ids as used throughout this notebook:
    # 97 = punctuation, 103 = whitespace.
    punct_pos, space_pos = 97, 103

    cleaned_dataset = []
    pos, stems = [], []
    total_rows = len(dataset)

    print("analyzing sentence features:", save_name)
    for index, row in dataset.iterrows():
        if index % print_row == 0:
            log("... searching row " + str(index) + "/" + str(total_rows))

        # First pass only tokenizes/tags so stop words and whitespace tokens
        # can be dropped; the second pass analyses the cleaned text.
        doc = nlp(split_punct(row["text"]))
        kept = [token.text for token in doc
                if not token.is_stop and token.pos != space_pos]
        doc = nlp(" ".join(kept))

        token_count = len(doc)
        if token_count == 0:
            continue

        row_pos = [token.pos for token in doc]
        row_stems = [stemmer.stem(token.text) for token in doc
                     if token.pos != punct_pos]
        pos.append(row_pos)
        stems.append(row_stems)

        emotion_words = get_emotion_words(row_stems, 0, list_of_lexica)
        cleaned_dataset.append([
            " ".join([token.text for token in doc]), row["affect"],
            token_count,
            sum(token.text.isupper() for token in doc) / token_count,
            len(doc.ents) / token_count,
            get_cons_punct_count(row_pos),
            emotion_words[0], emotion_words[1], emotion_words[2], emotion_words[3]])

    # default=0 keeps an all-empty (or empty) dataset from raising ValueError;
    # the three files are then written without data rows.
    seq_len = max((record[2] for record in cleaned_dataset), default=0)
    pos = extend_list(pos, seq_len, 999)
    stems = extend_list(stems, seq_len, "__")

    # NOTE(review): the notebook builds list_of_lexica in the order
    # happiness, sadness, anger, fear, while the last two columns are named
    # "fc", "ac". If those stand for fear/anger they are swapped relative to
    # the lexicon order -- confirm before relying on them.
    df = pd.DataFrame(data=cleaned_dataset, columns=["t", "a", "wc", "uwc", "ewc", "cpc", "hc", "sc", "fc", "ac"])
    df.to_csv("../cleaned/" + save_name + "_clean.csv", sep=",", index=False, float_format='%.3f')
    df = pd.DataFrame(data=pos)
    df.to_csv("../cleaned/" + save_name + "_pos.csv", sep=",", index=False, float_format='%.3f')
    df = pd.DataFrame(data=stems)
    df.to_csv("../cleaned/" + save_name + "_stems.csv", sep=",", index=False, float_format='%.3f')
    
def extend_list(l, seq_len, extension):
    """Right-pad every inner list of ``l`` to ``seq_len`` entries, in place.

    Args:
        l: List of lists; each inner list is modified directly.
        seq_len: Target length. Inner lists that are already this long or
            longer are left unchanged.
        extension: Value appended as padding.

    Returns:
        The same outer list object that was passed in.
    """
    for entry in l:
        missing = seq_len - len(entry)
        # A non-positive count yields an empty padding list.
        entry.extend([extension] * missing)
    return l
    
def split_punct(text):
    """Surround every ``.``, ``,``, ``!`` and ``?`` in ``text`` with spaces.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` in which each of the four marks is replaced by
        the same mark with one space added on either side.
    """
    for mark in ".,!?":
        text = text.replace(mark, " " + mark + " ")
    return text
    
def get_emotion_words(stems, emotion_index, list_of_lexica):
    """Count, per lexicon, how many of the given stems occur in it.

    Args:
        stems: Iterable of stems. A stem that appears several times is
            counted once per occurrence.
        emotion_index: Unused; kept so existing call sites keep working.
        list_of_lexica: Iterable of lexica, each a collection of hashable
            entries. Lexica passed as ``set``/``frozenset`` are used as-is.

    Returns:
        A float ``numpy`` array whose entry ``i`` is the number of stems
        found in lexicon ``i``. The array has one entry per lexicon but never
        fewer than four, so callers reading positions 0..3 keep working when
        fewer lexica are supplied.
    """
    lexica = list(list_of_lexica)
    # Materialise once so a one-shot iterable can be scanned per lexicon.
    stems = list(stems)

    # Previously fixed at 4 entries, which raised IndexError for a fifth lexicon.
    emotion_words = np.zeros(max(4, len(lexica)))
    for index, lexicon in enumerate(lexica):
        # Set lookup avoids rescanning the whole lexicon for every stem.
        if isinstance(lexicon, (set, frozenset)):
            lookup = lexicon
        else:
            lookup = set(lexicon)
        emotion_words[index] = sum(1 for stem in stems if stem in lookup)
    return emotion_words

def get_cons_punct_count(pos):
    """Count neighbouring positions in ``pos`` that both hold the value 97.

    Args:
        pos: List of pos ids for one text (97 is the id this notebook
            treats as punctuation).

    Returns:
        The number of indices ``i`` for which ``pos[i]`` and ``pos[i + 1]``
        are both 97; a run of three such ids therefore counts as two.
    """
    return sum(
        1
        for current, following in zip(pos, pos[1:])
        if current == 97 and following == current
    )

In [3]:
# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'
# Shared spaCy pipeline; extract_features reads it as the global `nlp`.
nlp = spacy.load("en_core_web_lg")
# Shared English Snowball stemmer; extract_features reads it as the global `stemmer`.
stemmer = nltk.stem.SnowballStemmer('english')

In [4]:
# File names (without ".csv") that make_data can load from ../raw/.
tweet_datasets = ["emoint", "crowdflower", "tec"]
# emotion_classification_1 ... emotion_classification_8
emotion_datasets = ["emotion_classification_" + str(part) for part in range(1, 9)]

# Lexicon files read from ../../lexica/; this order fixes the position of
# each lexicon inside list_of_lexica.
lexica = [
    "clean_happiness",
    "clean_sadness",
    "clean_anger",
    "clean_fear",
]
list_of_lexica = make_data(lexica, lexica=True)

In [5]:
# Full runs over the tweet and emotion data sets, currently disabled:
#tweet_dataset = make_data(tweet_datasets)
#emotion_dataset = make_data(emotion_datasets)
#extract_features(tweet_dataset, list_of_lexica, "tweet", 500)
#extract_features(emotion_dataset, list_of_lexica, "emotion", 1000)

# Active run: only ../raw/test.csv, output prefix "test", progress every 500 rows.
extract_features(make_data(["test"]), list_of_lexica, "test", 500)

analyzing sentence features: test
... searching row 0/33
