In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import nltk
from nltk.corpus import opinion_lexicon
from autocorrect import spell
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import multiprocessing
from multiprocessing import Pool
# Fetch the NLTK resources used by this notebook (already-present packages
# are reported as up-to-date and skipped).
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('opinion_lexicon')
nltk.download('stopwords')
# WordNetLemmatizer reads the 'wordnet' corpus; without it lemmatize() raises
# LookupError, which lemmatize_word's broad except would otherwise hide.
nltk.download('wordnet')

# Seed NumPy's global RNG so any NumPy-based randomness is repeatable.
np.random.seed(0)

# Loading current instance resource: one worker process and one data
# partition per CPU reported by the OS.
num_cores = multiprocessing.cpu_count()
num_partitions = num_cores

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/aiavorskii/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aiavorskii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/aiavorskii/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aiavorskii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the raw train/test CSVs from the working directory, decoding them as
# ISO-8859-1 (Latin-1).
train = pd.read_csv('train.csv', encoding = "ISO-8859-1")
test = pd.read_csv('test.csv', encoding = "ISO-8859-1")

In [7]:
def abbreviation_replacement(text):
    """Expand common English contractions and pad punctuation with spaces.

    The substitutions are plain substring replacements applied in order, and
    every pattern is lowercase, so the text should be lowercased beforehand.

    Args:
        text: Input string.

    Returns:
        The string with contractions expanded (e.g. "i'm" -> "i am",
        "don't" -> "do not") and each of ``, ! . ( ) ?`` surrounded by spaces.
    """
    replacements = (
        ("i'm", "i am"),
        ("'re", " are"),
        ("he's", "he is"),
        ("it's", "it is"),
        ("that's", "that is"),
        ("who's", "who is"),
        ("what's", "what is"),
        # Irregular negations must run before the generic "n't" rule, which
        # would otherwise yield "ca not", "wo not" and "sha not".
        ("can't", "can not"),
        ("won't", "will not"),
        ("shan't", "shall not"),
        ("n't", " not"),
        ("'ve", " have"),
        ("'d", " would"),
        ("'ll", " will"),
        # Punctuation becomes its own whitespace-separated token.
        (",", " , "),
        ("!", " ! "),
        (".", " . "),
        ("(", " ( "),
        (")", " ) "),
        ("?", " ? "),
    )
    for pattern, expansion in replacements:
        text = text.replace(pattern, expansion)
    return text

def replace_specific_symbols(text):
    """Drop HTML-escaped arrow and quote residue, then lowercase the text.

    An ``&lt;`` followed by one or more dashes is replaced by a space, and
    every ``&quot;`` entity is deleted.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    without_arrow = re.sub(r'(.*)&lt;([-])+(.*)', r'\1 \3', text)
    without_quotes = without_arrow.replace('&quot;', '')
    return without_quotes.lower()

def emphasize_pos_and_neg_words(text):
    """Replace sentiment-bearing words with ``<positive>`` / ``<negative>`` tags.

    Each whitespace-separated token found in ``positive_word_library`` becomes
    ``<positive>``; otherwise, if found in ``negative_word_library``, it becomes
    ``<negative>``; any other token is kept unchanged.

    NOTE(review): ``positive_word_library`` and ``negative_word_library`` are
    not defined in the visible cells -- presumably built from
    ``opinion_lexicon`` in a cell that is no longer present; confirm they
    exist before running on a fresh kernel.

    Args:
        text: Input string.

    Returns:
        The tokens joined by single spaces, with sentiment words tagged.
    """
    def _tag(token):
        if token in positive_word_library:
            return '<positive>'
        if token in negative_word_library:
            return '<negative>'
        return token

    return " ".join(_tag(token) for token in text.split())

def remove_usernames(text):
    """Drop every whitespace-separated token that begins with ``@``.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [token for token in text.split() if not token.startswith('@')]
    return " ".join(kept_tokens)
# TODO: expand informal shortcuts such as "dunno", "2day", "4" and "4ever".

def get_emoji_options():
    """Build the emoticon vocabularies used to tag tweets.

    Faces are generated combinatorially as eyes + optional nose + mouth, plus
    the mirrored form mouth + optional nose + eyes. Mouth letters are
    lowercase ("d", "p").

    Returns:
        A 4-tuple ``(loves, smilefaces, sadfaces, neutralfaces)`` of lists of
        strings. The three face lists contain no duplicates and have no
        guaranteed order.
    """
    loves = ["<3", "♥"]

    eyes = ["8", ":", "=", ";"]
    # The empty nose yields the nose-less variants (":)" next to ":-)").
    noses = ["", "'", "`", "-", "\\"]

    def _faces(mouths, mirrored_mouths):
        """Return the unique faces for forward and mirrored mouth shapes."""
        faces = set()
        for eye in eyes:
            for nose in noses:
                for mouth in mouths:
                    faces.add(eye + nose + mouth)
                for mouth in mirrored_mouths:
                    faces.add(mouth + nose + eye)
        return list(faces)

    smilefaces = _faces([")", "d", "]", "}", "p"], ["(", "[", "{"])
    # "/" is a mirrored sad mouth and also a neutral mouth, so faces such as
    # "/:" appear in both lists; callers decide which list takes precedence.
    sadfaces = _faces(["(", "[", "{"], [")", "]", "}", "/"])
    # The backslash mouth is a single "\" character. The previous raw string
    # r"\\" was two backslashes, so ":\" was never generated.
    neutralfaces = _faces(["|", "/", "\\"], ["|", "/", "\\"])

    return loves, smilefaces, sadfaces, neutralfaces

# These vocabularies are only used for `token in ...` checks, so keep them as
# frozensets: constant-time lookups instead of a list scan per token.
loves, smilefaces, sadfaces, neutralfaces = map(frozenset, get_emoji_options())

def emoji_translation(text):
    """Replace emoticon tokens with ``<positive>`` / ``<negative>`` tags.

    Tokens found in ``loves``, ``smilefaces`` or ``neutralfaces`` become
    ``<positive>`` (neutral faces are currently mapped to positive as well);
    tokens found only in ``sadfaces`` become ``<negative>``. Everything else is
    kept unchanged. Only whole whitespace-separated tokens are matched.

    Args:
        text: Input string.

    Returns:
        The tokens joined by single spaces, with emoticons tagged.
    """
    def _translate(token):
        # Positive vocabularies are consulted before sadfaces, so a face that
        # is present in both neutralfaces and sadfaces is tagged positive.
        if token in loves or token in smilefaces or token in neutralfaces:
            return "<positive>"
        if token in sadfaces:
            return "<negative>"
        return token

    return " ".join(map(_translate, text.split()))

def replace_multiple_dots(text):
    """Collapse every run of one or more dots into a single ``". "``.

    Args:
        text: Input string.

    Returns:
        The string with each dot run (".", "..", "...", ...) replaced by a dot
        followed by one space.
    """
    # Raw string: "\." in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r"\.+", ". ", text)

def extract_hashtag(text):
    """Collect the hashtags of a text into one space-separated string.

    A hashtag is any whitespace-separated token starting with ``#``; trailing
    non-word characters (e.g. punctuation) are stripped from each one.

    Args:
        text: Input string.

    Returns:
        The cleaned hashtags joined by single spaces ("" when there are none).
    """
    hashtags = []
    for token in text.split():
        if token.startswith("#"):
            hashtags.append(re.sub(r"(\W+)$", "", token))
    return " ".join(hashtags)

def correct_spelling_errors(text):
    """Squeeze repeated characters and spell-correct every word of a text.

    For each whitespace-separated word, any character repeated two or more
    times in a row is reduced to exactly two occurrences, then the word is
    passed through autocorrect's ``spell``.

    Args:
        text: Input string.

    Returns:
        The corrected words joined by single spaces.
    """
    corrected_words = []
    for word in text.split():
        squeezed = re.sub(r'(.)\1+', r'\1\1', word)
        corrected_words.append(spell(squeezed))
    return ' '.join(corrected_words)

# English stopwords from NLTK, stored as a frozenset because the collection is
# only used for `word in stoplist` checks (constant-time instead of a scan).
stoplist = frozenset(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a text.

    Args:
        text: Input string; tokens are split on whitespace.
        stop_words: Optional collection of words to drop. Defaults to the
            module-level ``stoplist``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stoplist
    # Build a new list instead of calling list.remove() while iterating: that
    # shifted the remaining items and skipped the token after every removal,
    # so consecutive stopwords survived.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Shared NLTK normalizers, created once at import time and reused by the
# stemming_* and lemmatize_* helpers below.
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()


#  Lemmatization
def lemmatize_word(w):
    """Lemmatize a single word with the shared WordNet lemmatizer.

    Best effort: if lemmatization fails, the word is returned unchanged, but a
    warning is emitted so that systematic failures (for example a missing
    'wordnet' corpus) do not go unnoticed.

    Args:
        w: The word to lemmatize.

    Returns:
        The lowercased lemma, or ``w`` as given when lemmatization failed.
    """
    try:
        return lemma.lemmatize(w).lower()
    except Exception as exc:
        import warnings

        # Constant message per exception type, so the default warning filter
        # reports it once instead of once per word.
        warnings.warn(
            "lemmatization failed (%s); returning the word unchanged"
            % type(exc).__name__,
            RuntimeWarning,
        )
        return w


def lemmatize_sentence(text):
    """Lemmatize every whitespace-separated token of a text.

    Args:
        text: Input string.

    Returns:
        The results of ``lemmatize_word`` joined by single spaces.
    """
    return " ".join(map(lemmatize_word, text.split()))


# Stemming
def stemming_word(w):
    """Return the stem of a single word using the shared ``stemmer``.

    Args:
        w: The word to stem.

    Returns:
        Whatever ``stemmer.stem`` produces for ``w``.
    """
    stem = stemmer.stem(w)
    return stem


def stemming_sentence(text):
    """Stem every whitespace-separated token of a text.

    Args:
        text: Input string.

    Returns:
        The results of ``stemming_word`` joined by single spaces.
    """
    return " ".join(map(stemming_word, text.split()))

def process_row(row):
    """Run the full text-cleaning pipeline on one tweet.

    Steps, in order: strip HTML-escaped symbols and lowercase, tag emoticons,
    collapse dot runs, expand contractions and pad punctuation, drop
    @usernames, fix spelling, remove stopwords, tag sentiment words, stem and
    lemmatize.

    Args:
        row: Raw tweet text.

    Returns:
        The cleaned text, stripped and lowercased.
    """
    row = replace_specific_symbols(row)
    # Emoticons and dot runs must be handled while punctuation is still
    # intact: abbreviation_replacement pads ". ( ) , ! ?" with spaces, which
    # splits ":)" into ": )" and "..." into separate dots, so neither step
    # could match anything when it ran afterwards.
    row = emoji_translation(row)
    row = replace_multiple_dots(row)
    row = abbreviation_replacement(row)
    row = remove_usernames(row)
    row = correct_spelling_errors(row)
    row = remove_stopwords(row)
    row = emphasize_pos_and_neg_words(row)
    row = stemming_sentence(row)
    row = lemmatize_sentence(row)
    return row.strip().lower()

def extract_features_subjectivity(text):
    """Return TextBlob's subjectivity score for a text.

    The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very
    objective and 1.0 is very subjective.

    Args:
        text: Input string.

    Returns:
        The ``sentiment.subjectivity`` value of ``TextBlob(text)``.
    """
    return TextBlob(text).sentiment.subjectivity

# Created on first use, then shared by every call in this process.
_naive_bayes_analyzer = None


def extract_features_classification(text):
    """Return the Naive Bayes sentiment label TextBlob assigns to a text.

    A NaiveBayesAnalyzer trains its classifier lazily, per instance, so
    creating a new analyzer for every call retrains it for every row. One
    analyzer is therefore created on first use and reused afterwards; each
    worker process gets its own instance.

    Args:
        text: Input string.

    Returns:
        The ``sentiment.classification`` value of the analysed text
        (TextBlob documents it as 'pos' or 'neg').
    """
    global _naive_bayes_analyzer
    if _naive_bayes_analyzer is None:
        _naive_bayes_analyzer = NaiveBayesAnalyzer()
    sent = TextBlob(text, analyzer=_naive_bayes_analyzer)
    return sent.sentiment.classification
    
def apply_processing(data):
    """Clean the tweets of a frame and add the sentiment feature columns.

    Args:
        data: DataFrame with a ``SentimentText`` column of raw tweet strings.

    Returns:
        A new DataFrame in which ``SentimentText`` holds the cleaned text and
        the columns ``subjectivity`` and ``classification`` hold the features
        computed from that cleaned text. The input frame is left untouched.
    """
    # Work on a copy: assigning into the caller's frame made the function
    # non-idempotent (a second call would re-clean already cleaned text) and
    # raises SettingWithCopyWarning when `data` is a slice of a larger frame.
    processed = data.copy()
    processed['SentimentText'] = processed['SentimentText'].apply(process_row)
    processed['subjectivity'] = processed['SentimentText'].apply(extract_features_subjectivity)
    processed['classification'] = processed['SentimentText'].apply(extract_features_classification)
    return processed

# processed_train = apply_processing(train)

def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of a DataFrame in worker processes.

    The frame is cut into ``num_partitions`` consecutive row chunks, each chunk
    is passed to ``func`` in a pool of ``num_cores`` processes, and the results
    are concatenated in the original chunk order.

    NOTE(review): with the "spawn" start method, functions defined inside a
    notebook may not be importable by the workers -- confirm on the target
    platform.

    Args:
        df: DataFrame to process.
        func: Picklable callable taking a DataFrame chunk and returning a
            DataFrame.

    Returns:
        The concatenation of ``func``'s results.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame relies on DataFrame.swapaxes, which recent pandas deprecates.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Release the worker processes even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [None]:
# Clean the test set in parallel worker processes and store the result as a
# pickle in the working directory.
x_test = parallelize_dataframe(test, apply_processing)
x_test.to_pickle("test_clean.pkl")



In [None]:
# Clean the training set in parallel worker processes and store the result as
# a pickle in the working directory.
x_train = parallelize_dataframe(train, apply_processing)
x_train.to_pickle("train_clean.pkl")

In [8]:
# Sanity check: correct_spelling_errors squeezes repeated characters itself,
# so the raw elongated word can be passed in directly.
correct_spelling_errors("juuuuuuuuuuuuuuuuussssst")


'just'