In [16]:
import pandas as pd

### iSEAR preprocessing

In [17]:
df = pd.read_csv("./dataset/original_datasets/isear.csv", sep="|")

In [18]:
df = df[["Field1", "SIT"]]
df = df.rename(columns={"Field1": "emotion", "SIT": "comment"})

In [19]:
df = df.drop(df[df.comment.str.startswith("[") & df.comment.str.endswith("]")].index)

In [20]:
df = df.drop_duplicates(subset="comment")

In [21]:
df['emotion'].value_counts()

fear       1073
anger      1070
joy        1066
disgust    1063
guilt      1045
sadness    1040
shame      1040
Name: emotion, dtype: int64

In [22]:
df = df.drop(df[df.emotion == "shame"].index)
df = df.drop(df[df.emotion == "guilt"].index)

In [125]:
from spellchecker import SpellChecker

spell = SpellChecker()

def spell_check(x):
    correct_word = []
    mispelled_word = x.split()
    for word in mispelled_word:
        correct_word.append(spell.correction(word))
#     print("dziala")
    return ' '.join(correct_word)

In [24]:
df['comment'] = df['comment'].apply(lambda x: spell_check(x))

In [25]:
df

Unnamed: 0,emotion,comment
0,joy,During the period of falling in love each time...
1,fear,When I was involved in a traffic accident
2,anger,When I was driving home after several days of ...
3,sadness,When I lost the person who meant the most to me
4,disgust,The time I knocked a deer down - the sight of ...
...,...,...
7659,joy,I received a letter from a distant friend
7660,fear,My parents were out and I was the eldest at ho...
7661,anger,Two years back someone invited me to be the tu...
7662,sadness,I had taken the responsibility to do something...


In [41]:
df = df.drop(df[df['comment'].str.split(" ").apply(len) < 5].index)

In [42]:
df.describe()

Unnamed: 0,emotion,comment
count,5149,5149
unique,5,5149
top,anger,During the period of falling in love each time...
freq,1043,1


In [43]:
df['emotion'].value_counts()

anger      1043
fear       1042
joy        1040
disgust    1039
sadness     985
Name: emotion, dtype: int64

In [44]:
import re

def remove_non_utf_8_chars_from_text(text):
    pattern = re.compile('(?i)[^a-z0-9 \n]+')
    return pattern.sub('', text)

In [46]:
df['comment'] = df['comment'].apply(remove_non_utf_8_chars_from_text)

In [52]:
df['comment'] = df['comment'].str.lower()

In [54]:
import contractions

df['comment'] = df['comment'].apply(contractions.fix)

In [56]:
df.to_csv("./dataset/preprocessed_datasets/isear_preprocessed.csv", sep=",", index=False)

### tweet_emotion preprocessing

In [108]:
df = pd.read_csv("./dataset/original_datasets/tweet_emotions.csv", sep=',')
df = df[["sentiment", "content"]]
df = df.rename(columns={"sentiment": "emotion", "content": "comment"})

In [109]:
df = df.drop(df[df.emotion == "neutral"].index)
df = df.drop(df[df.emotion == "hate"].index)
df = df.drop(df[df.emotion == "empty"].index)
df = df.drop(df[df.emotion == "boredom"].index)
df = df.drop(df[df.emotion == "enthusiasm"].index)
df = df.drop(df[df.emotion == "relief"].index)
df = df.drop(df[df.emotion == "fun"].index)
df = df.drop(df[df.emotion == "love"].index)

In [110]:
df['comment'] = df['comment'].str.lower()

In [111]:
def remove_links_and_tags_from_text(text):
    text = re.sub(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
    '', text, flags=re.MULTILINE)
    text = (re.sub(r"[@](.*?)[\s]", '', text, flags=re.MULTILINE))
    return text

In [112]:
df['comment'] = df['comment'].apply(remove_links_and_tags_from_text)

In [113]:
df['comment'] = df['comment'].apply(contractions.fix)

In [114]:
df['comment'] = df['comment'].apply(remove_non_utf_8_chars_from_text)

In [115]:
df

Unnamed: 0,emotion,comment
1,sadness,layin n bed with a headache ughhhhwaitin on y...
2,sadness,funeral ceremonygloomy friday
5,worry,repinging why did not you go to prom bc my bf ...
6,sadness,i should be sleep but I am not thinking about ...
7,worry,hmmm is down
...,...,...
39986,happiness,going to watch boy in the striped pjs hope i d...
39987,happiness,gave the bikes a thorough wash degrease it and...
39988,happiness,had such and amazing time last night mcfly wer...
39994,happiness,succesfully following tayla


In [116]:
def remove_chained_whitespaces(text):
    for i in range(15):
        text = (re.sub("  ", ' ', text, flags=re.MULTILINE))
    return text

In [117]:
df['comment'] = df['comment'].apply(remove_chained_whitespaces)

In [118]:
df = df.drop(df[df['comment'].str.split(" ").apply(len) < 5].index)

In [119]:
df

Unnamed: 0,emotion,comment
1,sadness,layin n bed with a headache ughhhhwaitin on yo...
5,worry,repinging why did not you go to prom bc my bf ...
6,sadness,i should be sleep but I am not thinking about ...
8,sadness,charlene my love i miss you
9,sadness,I am sorry at least it is friday
...,...,...
39985,happiness,husband is golfing amp the toddler and i shall...
39986,happiness,going to watch boy in the striped pjs hope i d...
39987,happiness,gave the bikes a thorough wash degrease it and...
39988,happiness,had such and amazing time last night mcfly wer...


In [88]:
from textblob import TextBlob

def check_spelling(text):
    textBlb = TextBlob(text)
    return textBlb.correct() 

In [120]:
import jamspell

In [121]:
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('./pretrained_vectors/en.bin')

True

In [122]:
df['comment'] = df['comment'].apply(corrector.FixFragment)

In [123]:
df

Unnamed: 0,emotion,comment
1,sadness,layin n bed with a headache ughhhhwaitin on yo...
5,worry,repinging why did not you go to from bc my bf ...
6,sadness,i should be sleep but I am not thinking about ...
8,sadness,charlene my love i miss you
9,sadness,I am sorry at least it is friday
...,...,...
39985,happiness,husband is golfing amp the toddler and i shall...
39986,happiness,going to watch boy in the striped pjs hope i d...
39987,happiness,gave the bikes a thorough wash decrease it and...
39988,happiness,had such and amazing time last night mcfly wer...


In [126]:
df['comment'] = df['comment'].apply(lambda x: spell_check(x))

In [127]:
df.describe()

Unnamed: 0,emotion,comment
count,19359,19359
unique,5,19259
top,worry,i am lost please help me find a good home
freq,7836,15


In [128]:
df = df.drop_duplicates(subset="comment")

In [129]:
df.describe()

Unnamed: 0,emotion,comment
count,19259,19259
unique,5,19259
top,worry,laying i bed with a headache ughhhhwaitin on y...
freq,7788,1


In [130]:
df.to_csv("./dataset/preprocessed_datasets/tweet_emotion_preprocessed.csv", sep=",", index=False)

### Crawled dataset preprocessing

In [157]:
df_disgust = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_obrzydzenie.txt", sep="\n", header=None)
df_disgust.insert(0, "emotion", "disgust")
df_disgust = df_disgust.rename(columns={0: "comment"})
df_disgust = df_disgust.drop_duplicates(subset="comment")

In [158]:
df_sadness = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_smutek.txt", sep="\n", header=None)
df_sadness.insert(0, "emotion", "sadness")
df_sadness = df_sadness.rename(columns={0: "comment"})
df_sadness = df_sadness.drop_duplicates(subset="comment")

In [159]:
df_fear = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_strach.txt", sep="\n", header=None)
df_fear.insert(0, "emotion", "fear")
df_fear = df_fear.rename(columns={0: "comment"})
df_fear = df_fear.drop_duplicates(subset="comment")

In [160]:
df_happiness = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_szczescie.txt", sep="\n", header=None)
df_happiness.insert(0, "emotion", "happiness")
df_happiness = df_happiness.rename(columns={0: "comment"})
df_happiness = df_happiness.drop_duplicates(subset="comment")

In [161]:
df_surprise = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_zaskoczenie.txt", sep="\n", header=None)
df_surprise.insert(0, "emotion", "surprise")
df_surprise = df_surprise.rename(columns={0: "comment"})
df_surprise = df_surprise.drop_duplicates(subset="comment")

In [162]:
df_anger = pd.read_csv("./dataset/original_datasets/twitter_crawl_data/output_zlosc.txt", sep="\n", header=None)
df_anger.insert(0, "emotion", "anger")
df_anger = df_anger.rename(columns={0: "comment"})
df_anger = df_anger.drop_duplicates(subset="comment")

In [163]:
df_all = pd.concat([df_disgust, df_sadness, df_fear, df_happiness, df_surprise, df_anger])

In [164]:
df_all.count()

emotion    8199
comment    8199
dtype: int64

In [168]:
df_all['emotion'].value_counts()

fear         2939
happiness    1790
sadness      1228
anger         918
surprise      683
disgust       641
Name: emotion, dtype: int64

In [169]:
df_all['comment'] = df_all['comment'].str.lower()

In [170]:
df_all['comment'] = df_all['comment'].apply(remove_links_and_tags_from_text)

In [171]:
df_all['comment'] = df_all['comment'].apply(contractions.fix)

In [172]:
df_all['comment'] = df_all['comment'].apply(remove_non_utf_8_chars_from_text)

In [173]:
df_all['comment'] = df_all['comment'].apply(remove_chained_whitespaces)

In [174]:
df_all = df_all.drop(df_all[df_all['comment'].str.split(" ").apply(len) < 5].index)

In [175]:
df_all['comment'] = df_all['comment'].apply(corrector.FixFragment)

In [176]:
df_all['comment'] = df_all['comment'].apply(lambda x: spell_check(x))

In [177]:
df_all

Unnamed: 0,emotion,comment
0,disgust,disgust is an important core emotion to learn ...
4,disgust,disgust and its close relative contempt can in...
5,disgust,can never participate in any form of any gover...
6,disgust,can someone plus hack this person he is a guy ...
7,disgust,I am with disgust not ready to come back to th...
...,...,...
3321,anger,excessive anger can because problems increased...
3323,anger,anger is in itself neither good nor bait is wh...
3324,anger,anger can be a positive and useful emotion if ...
3325,anger,way to manage anger think before you speakpici...


In [180]:
df_all['emotion'].value_counts()

fear         2909
happiness    1726
sadness      1189
anger         891
surprise      658
disgust       613
Name: emotion, dtype: int64

In [183]:
df_all.to_csv("./dataset/preprocessed_datasets/twitter_crawl_data_preprocessed.csv", sep=",", index=False)