# Imports and reading data.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import shlex, subprocess
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from collections import Counter
from tensorflow.contrib import rnn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

%pylab inline

# Load the labelled tweet data set and keep the list of its column names.
data = pd.read_csv('text_emotion.csv')
names = data.columns.tolist()

In [None]:
# Preprocessing: keep only the rows whose label is one of the five labels
# listed in `baseline` and map those labels to float codes so that they are
# easy to use as targets for the nn.
baseline = ['sadness', 'worry', 'happiness', 'love', 'neutral']
# isin() builds the boolean mask in one vectorised step; .copy() makes the
# filtered frame independent of the frame read from disk, so the column
# assignment below does not write into a slice of another frame.
data = data[data['sentiment'].isin(baseline)].copy()
data['sentiment'] = data['sentiment'].map({'sadness' : 1.0, 'worry' : 2.0,
                        'happiness' : 3.0, 'love' : 4.0, 'neutral' : 5.0})
# 80/20 split; no random_state is passed, so the split differs between runs.
train_data, test_data = train_test_split(data, test_size=0.2)

In [None]:
def draw_histogram(data):
    """Plot a bar chart of how often each sentiment label occurs.

    `data` is indexed with the 'sentiment' key; the values found there are
    counted with collections.Counter and drawn with matplotlib, one bar per
    distinct label, with the label written vertically under the bar.
    """
    bar_width = 0.5
    frequencies = Counter(data['sentiment'])
    labels, values = zip(*frequencies.items())
    positions = np.arange(len(labels))
    # Ticks are shifted by half a bar width relative to the bar positions.
    tick_positions = positions + bar_width * 0.5
    plt.xticks(tick_positions, labels, rotation = 'vertical')
    plt.bar(positions, values, bar_width)

In [None]:
# Distribution of the sentiment labels in the training split.
draw_histogram(train_data)

In [None]:
# Distribution of the sentiment labels in the test split.
draw_histogram(test_data)

# Extracting features

In [None]:
def preprocess_sentence(sentence, stemmer=None):
    """Tokenise, strip '@' mentions, lemmatise and re-join a sentence.

    The sentence is split with nltk's word_tokenize. Every token equal to
    '@' is dropped together with the token that follows it; all remaining
    tokens are lemmatised with WordNetLemmatizer.

    Parameters:
        sentence: the text to clean.
        stemmer: optional object with a ``stem(word)`` method (for example
            nltk's PorterStemmer). When given, every lemma is additionally
            stemmed. The default (None) keeps the lemmas as they are.

    Returns:
        The processed tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    skip_next = False
    for token in word_tokenize(sentence):
        if token == '@':
            # The token right after '@' is skipped as well.
            skip_next = True
        elif skip_next:
            skip_next = False
        else:
            lemmas.append(lemmatizer.lemmatize(token))
    if stemmer is not None:
        return " ".join(stemmer.stem(lemma) for lemma in lemmas)
    return " ".join(lemmas)
            
# Clean the text and turn it into TF-IDF features.
# (Original note: the rnn expects tensor[B,T....] where B - batch size,
# T - Timesteps, and then goes features.)
data['content'] = list(map(preprocess_sentence, data['content']))
# train_data / test_data were split off `data` before the line above ran, so
# their own 'content' column still holds the unprocessed text. Fetch the
# cleaned text by row label so the vectorizer really sees the preprocessed
# sentences, while the split frames themselves stay untouched.
train_content = data.loc[train_data.index, 'content']
test_content = data.loc[test_data.index, 'content']
vectorizer = TfidfVectorizer(stop_words = 'english')
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
X_train_TF = pd.DataFrame(vectorizer.fit_transform(train_content).toarray())
X_test_TF = pd.DataFrame(vectorizer.transform(test_content).toarray())
Y_train = train_data['sentiment']
Y_test = test_data['sentiment']
print(X_test_TF.shape)

In [None]:
 """takes a list of tokens for which returns following features as a list
1 - number of exclamation marks
2 - number of question marks
3 - number of double-quote characters
4 - number of words in upper case
5 - boolean value: True if any token contains the same character 3+ times in a row."""
def get_second_feature_block_values(tokens):
    """Count punctuation and casing features over a list of tokens.

    Returns a five-element list:
        [number of '!' characters,
         number of '?' characters,
         number of '"' characters,
         number of tokens for which str.isupper() is true,
         True if contains_vowel_repetition() is true for any token]
    """
    exclamations = 0
    questions = 0
    quotes = 0
    shouted = 0
    has_repetition = False
    for token in tokens:
        exclamations += token.count("!")
        questions += token.count("?")
        quotes += token.count('"')
        shouted += int(token.isupper())
        if contains_vowel_repetition(token):
            has_repetition = True
    return [exclamations, questions, quotes, shouted, has_repetition]

"""returns true if the word containing 3+ single-character repetion"""    
def contains_vowel_repetition(word):
    """Return True if `word` contains one character 3 or more times in a row.

    Despite the name, the check is not limited to vowels: any character
    (letters, digits, punctuation) repeated three times consecutively counts.
    Words shorter than three characters always give False.
    """
    # Compare every character with its two successors; any() stops at the
    # first run found instead of scanning the rest of the word.
    return any(
        first == second == third
        for first, second, third in zip(word, word[1:], word[2:])
    )

"""takes a list of tokens for which returns following features as a list
1 - total positive score
2 - total negative score
3 - number of highly emotional positive words(w_score >= 3)
4 - number of highly emotional negative words(w_score <= -3)
5 - ratio of emotional words."""
def get_first_features_block_values(tokens):
    """Compute sentiment-score features for a list of tokens.

    get_sentiment_rate() is called once for the tokens joined by spaces and
    once more for every single token; element 0 of its result is used as the
    positive score and element 1 as the negative score.

    Returns a five-element list:
        [positive score of the whole sentence,
         negative score of the whole sentence,
         number of tokens whose positive score is the larger one and >= 3,
         number of tokens whose negative score is the larger one and >= 3,
         (positive - negative) / (positive + negative) for the sentence,
         or 0 when both sentence scores are <= 1]
    """
    sentence_scores = get_sentiment_rate(' '.join(tokens))
    total_positive = sentence_scores[0]
    total_negative = sentence_scores[1]

    strong_positive_words = 0
    strong_negative_words = 0
    for token in tokens:
        token_scores = get_sentiment_rate(token)
        if token_scores[0] > token_scores[1] and token_scores[0] >= 3:
            strong_positive_words += 1
        elif token_scores[0] < token_scores[1] and token_scores[1] >= 3:
            strong_negative_words += 1

    if total_positive > 1 or total_negative > 1:
        ratio = (total_positive - total_negative) / (total_positive + total_negative)
    else:
        # Both scores are at most 1: the ratio is defined as 0.
        ratio = 0
    return [total_positive, total_negative,
            strong_positive_words, strong_negative_words, ratio]

"""for specified string returns its sentiStrength score as a pair of numbers
first corresponds to positive rate, second for negative."""
def get_sentiment_rate(sourceString,
                       jar_path="/home/george/ipython/SentiStrength.jar",
                       data_path="/home/george/ipython/data/"):
    """Run SentiStrength on a string and return the digits it prints.

    A ``java -jar <jar_path> stdin sentidata <data_path>`` process is started
    for every call. The text is written to its standard input with spaces
    replaced by '+', and every ASCII digit found in the process output is
    returned as an int.

    Parameters:
        sourceString: the text to score.
        jar_path: location of SentiStrength.jar (defaults to the path that
            was previously hard-coded).
        data_path: SentiStrength data directory (same default as before).

    Returns:
        A list of ints in output order. Callers read element 0 as the
        positive score and element 1 as the negative score; only digits are
        kept, so a minus sign in the output is dropped.
    """
    # An argument list avoids re-parsing a command string, so paths that
    # contain spaces work too.
    command = ["java", "-jar", jar_path, "stdin", "sentidata", data_path]
    process = subprocess.Popen(command, stdin = subprocess.PIPE,
                               stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    stdout_response, _ = process.communicate(sourceString.replace(" ", "+").encode())
    # Decode instead of str(bytes): the bytes repr escapes non-ASCII output
    # as \xNN, and the hex digits of those escapes would be read as scores.
    output_text = stdout_response.decode("ascii", errors="ignore")
    return [int(char) for char in output_text if char.isdigit()]

def get_wrapped_features(tokens):
    """Return both feature blocks for `tokens` as one flat list.

    The values of get_first_features_block_values() come first, followed by
    the values of get_second_feature_block_values().
    """
    sentiment_features = get_first_features_block_values(tokens)
    surface_features = get_second_feature_block_values(tokens)
    return sentiment_features + surface_features

def extend_data_frame(data_frame, number_features):
    """Add `number_features` empty (NaN) columns to `data_frame` in place.

    The new columns are labelled ``width + 1 + i`` for i in
    0..number_features-1, where ``width`` is the number of columns the frame
    had before the call. An existing column with such a label is overwritten
    with NaN.

    Parameters:
        data_frame: the pandas DataFrame to widen; it is modified in place.
        number_features: how many columns to add.

    Returns:
        The same `data_frame` object that was passed in.
    """
    features_num = data_frame.shape[1]
    for i in range(number_features):
        # DataFrame.assign returns a new frame and treats the keyword as the
        # literal column name, so the columns are created by item assignment.
        data_frame[features_num + 1 + i] = np.nan
    return data_frame

def join_features(data_frame, features, index):
    """Write `features` into the feature columns of row `index`, in place.

    The feature columns are labelled ``base + 1 + i``. When the frame already
    ends with ``len(features)`` columns carrying those labels (because they
    were reserved beforehand or created by an earlier call), they are reused.
    Otherwise ``base`` is the current number of columns and the feature
    columns are created, with NaN in all other rows.

    Parameters:
        data_frame: pandas DataFrame to modify in place.
        features: sequence of values, one per feature column.
        index: row label of the row to fill.
    """
    width = data_frame.shape[1]
    count = len(features)
    # Candidate labels assuming the feature columns are already part of the
    # frame; recomputing them from the full width on every call would move
    # each row's features into a fresh set of columns.
    labels = [width - count + 1 + i for i in range(count)]
    if not all(label in data_frame.columns for label in labels):
        labels = [width + 1 + i for i in range(count)]
    for label, value in zip(labels, features):
        # .loc replaces DataFrame.set_value and creates a missing column.
        data_frame.loc[index, label] = value

In [None]:
# Compute the hand-crafted feature list for every test sentence.
# `i` is the row counter used by the cell that joins the features later on.
features_sets = []
i = 0
k = 0
for k, content in enumerate(test_data['content'], start=1):
    tokens = word_tokenize(content)
    features_sets.append(get_wrapped_features(tokens))
    # Progress output: one number per 100 processed sentences.
    if k % 100 == 0:
        print(k//100)
print("features done")

In [None]:
# Pass the test TF-IDF frame through extend_data_frame, asking for 10 extra
# columns (each of the two feature blocks returns 5 values).
# extend_data_frame returns the frame it was given, so X_test and X_test_TF
# refer to the same object.
X_test = extend_data_frame(X_test_TF, 10)
print("frame extended")

In [None]:
# One row per test sentence, one column per hand-crafted feature value.
features_for_test = pd.DataFrame(features_sets)
# Show the first rows as the cell output.
features_for_test.head()

In [None]:
# Save the train/test splits and their label columns as tab-separated files
# in the current working directory.
train_data.to_csv("train_raw_data.csv", sep='\t')
test_data.to_csv("test_raw_data.csv", sep='\t')
Y_train.to_csv("y_train.csv", sep='\t')
Y_test.to_csv("y_test.csv", sep='\t')

In [None]:
# Write every feature list into its row of X_test.
# The row number comes from enumerate, so the loop no longer depends on `i`
# having been reset to 0 in another cell; re-running this cell therefore
# fills the same rows again instead of continuing past the end of the frame.
for i, feature_set in enumerate(features_sets, start=1):
    join_features(X_test, feature_set, i - 1)
    # Progress output: one number per 100 joined rows.
    if i % 100 == 0:
        print(i//100)
X_test.head()

In [None]:
# Hand-crafted features for the training data. Only the first 6262 training
# sentences are processed here.
features_sets_tr = []
it = 0
kt = 0
for kt, content in enumerate(train_data['content'][:6262], start=1):
    tokens = word_tokenize(content)
    features_sets_tr.append(get_wrapped_features(tokens))
    # Progress output every 100 sentences (prints the sentence count).
    if kt % 100 == 0:
        print(kt)
print("features done")

X_train = extend_data_frame(X_train_TF, 10)
print("frame extended")
# Row `it - 1` of X_train receives the it-th feature list.
for it, feature_set in enumerate(features_sets_tr, start=1):
    join_features(X_train, feature_set, it - 1)
    # Progress output every 100 rows (prints the number of hundreds).
    if it % 100 == 0:
        print(it//100)

In [None]:
# features_for_train is not assigned anywhere in the cells shown above, so it
# is built here from features_sets_tr, the same way features_for_test is
# built from features_sets.
features_for_train = pd.DataFrame(features_sets_tr)
# mode='a' appends to an existing file and header=False writes no column
# names, so repeated runs keep adding rows to features_for_train.csv.
features_for_train.to_csv('features_for_train.csv', mode='a', header=False, sep='\t')