## Load needed things

### Import packages

In [None]:
import numpy as np
import nltk
import pandas as pd
import xml.etree.ElementTree as ET
from collections import namedtuple
from os import listdir
from os.path import isfile, join
import re
import string
from typing import Callable, List
from spellchecker import SpellChecker
import urllib.request
import json
from tqdm import tqdm
import spacy
import ftfy
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import KFold
from polyglot.text import Text
from mtranslate import translate
from sklearn.linear_model import LogisticRegression


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register tqdm with pandas so that `progress_apply` (used in the cleaning
# cells) is available on Series/DataFrame objects.
tqdm.pandas()

### Define needed variables

In [None]:
# Record type for one parsed tweet: id, text content and polarity label.
tweet = namedtuple('Tweet', ['tweetid', 'content', 'polarity'])
# NLTK's Spanish stop-word list as a set; used by remove_stopwords.
stopWords = set(nltk.corpus.stopwords.words('spanish'))
# spaCy pipeline loaded under the name 'es'; used by stem_tweet for lemmas.
nlp = spacy.load('es')
# Spanish spell checker; used by fix_spelling.
spell = SpellChecker(language='es')

## Preprocessing

### Read data

In [None]:
def read_data(file: str) -> pd.DataFrame:
    """Read tweets from an XML file and return them as a dataframe.

    Every child element of the XML root is treated as one tweet. The text of
    its first sub-element is used as the tweet id, the text of its third
    sub-element as the content, and the text of ``child[5][0][0]`` as the
    polarity label.

    Args:
        file: Path of the XML file to parse.

    Returns:
        A dataframe with one row per tweet, built from ``tweet`` namedtuples
        (fields ``tweetid``, ``content`` and ``polarity``).
    """
    # ElementTree opens and closes the file on its own when given a path, so
    # no separate ``open`` call (and no dangling handle) is needed.
    root = ET.parse(file).getroot()
    return pd.DataFrame([
        tweet(child[0].text, child[2].text, child[5][0][0].text)
        for child in root
    ])

def read_folder(folder: str) -> pd.DataFrame:
    """Read every XML file in a folder and return the combined dataframe.

    Only regular files directly inside ``folder`` whose name ends in ``.xml``
    (case-insensitive) are read, e.g. the training and dev sets. Files are
    processed in sorted name order so the row order is deterministic.

    Args:
        folder: Path of the folder to scan; a trailing path separator is
            optional.

    Returns:
        The row-wise concatenation of the dataframes produced by
        ``read_data`` for each XML file.

    Raises:
        ValueError: Raised by ``pd.concat`` when the folder contains no XML
            file.
    """
    dataframes = [
        # ``join`` inserts the separator when needed; plain string
        # concatenation silently built a wrong path for "data/cr".
        read_data(join(folder, name))
        for name in sorted(listdir(folder))
        if isfile(join(folder, name)) and name.lower().endswith('.xml')
    ]
    return pd.concat(dataframes)

### String manipulation

Emoticons are not handled separately, because the data contains fewer than 10 of them.

In [None]:
def remove_mention(tweet: str) -> str:
    """Remove ``@user`` mentions from a tweet.

    A mention is an ``@`` followed by one or more ASCII letters, digits or
    underscores. Underscores are included so that a handle such as
    ``@user_name`` is removed completely instead of leaving ``_name`` behind.

    Args:
        tweet: The tweet text.

    Returns:
        The tweet with every mention deleted; surrounding whitespace is left
        untouched.
    """
    return re.sub(r'@[A-Za-z0-9_]+', '', tweet)

def lower_case(tweet: str) -> str:
    """Return the tweet with all cased characters converted to lower case."""
    lowered = tweet.lower()
    return lowered

def remove_question_mark(tweet: str) -> str:
    """Delete every inverted (Spanish opening) question mark from a tweet."""
    # Splitting on the character and gluing the pieces back together drops
    # all of its occurrences.
    pieces = tweet.split('¿')
    return ''.join(pieces)

def remove_punctuation(tweet: str) -> str:
    """Strip every character listed in ``string.punctuation`` from a tweet."""
    unwanted = set(string.punctuation)
    return ''.join(char for char in tweet if char not in unwanted)

def remove_whitespace(tweet: str) -> str:
    """Return the tweet without leading and trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

def check_int(s: str) -> bool:
    """Tell whether a string is an integer literal with an optional sign.

    Args:
        s: The string to test.

    Returns:
        True when ``s`` consists only of digits, optionally preceded by a
        single ``-`` or ``+``. False otherwise, including for the empty
        string and for a lone sign.
    """
    # Guard the empty string: indexing s[0] on it would raise IndexError.
    if not s:
        return False
    digits = s[1:] if s[0] in ('-', '+') else s
    return digits.isdigit()

def remove_numbers(tweet: str) -> str:
    """Drop integer tokens from a tweet.

    The tweet is tokenized with NLTK, every token that ``check_int`` accepts
    is discarded, and the remaining tokens are joined with single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(tweet):
        if check_int(token):
            continue
        kept.append(token)
    return ' '.join(kept)

def fix_encoding(tweet: str) -> str:
    """Pass the tweet through ``ftfy.fix_encoding`` and return the result."""
    repaired = ftfy.fix_encoding(tweet)
    return repaired

def fix_repeated_letters(tweet: str) -> str:
    """Collapse runs of a repeated character down to two occurrences.

    Runs of three or more identical characters are shortened to two; runs of
    exactly two are matched as well but come out unchanged.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', tweet)

def fix_spelling(tweet: str) -> str:
    """Replace misspelled words in a tweet with the spell checker's suggestion.

    The tweet is tokenized with NLTK. Every token that the module-level
    ``spell`` checker reports as unknown is replaced by ``spell.correction``;
    all other tokens are kept as they are.

    Args:
        tweet: The tweet text.

    Returns:
        The tokens, corrected where possible, joined with single spaces.
    """
    tokenized = nltk.word_tokenize(tweet)
    misspelled = spell.unknown(tokenized)
    corrected = []
    for word in tokenized:
        if word in misspelled:
            # ``correction`` can return None when it has no candidate; keep
            # the original token then so that ``join`` does not fail.
            word = spell.correction(word) or word
        corrected.append(word)
    return ' '.join(corrected)

def clean_tweet(tweet: str) -> str:
    """Run a tweet through the first cleaning pipeline.

    The steps are applied in order, each one receiving the output of the
    previous one: encoding repair, mention removal, lower-casing, removal of
    the inverted question mark, punctuation removal, number removal,
    whitespace trimming, collapsing of repeated letters and spell correction.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet.
    """
    functions: List[Callable] = [
        # Encoding repair has to see the raw text: lower-casing first changes
        # the characters of a mis-decoded sequence, after which it can no
        # longer be recognised and repaired.
        fix_encoding,
        remove_mention,
        lower_case,
        remove_question_mark,
        remove_punctuation,
        remove_numbers,
        remove_whitespace,
        fix_repeated_letters,
        fix_spelling,
    ]
    for f in functions:
        tweet = f(tweet)

    return tweet

### Spelling
Preprocessing steps that must run after spell correction: stop-word removal and lemmatization.

In [None]:
def remove_stopwords(tweet: str) -> str:
    """Drop every token found in ``stopWords`` from a tweet.

    The tweet is tokenized with NLTK and the surviving tokens are joined
    with single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(tweet):
        if token in stopWords:
            continue
        kept.append(token)
    return ' '.join(kept)

def stem_tweet(tweet: str) -> str:
    """Replace each token of a tweet by its spaCy lemma.

    Despite the name, this uses the ``lemma_`` attribute of the tokens
    produced by the module-level ``nlp`` pipeline rather than a stemmer.
    The lemmas are joined with single spaces.
    """
    doc = nlp(tweet)
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)


def clean_tweet2(tweet: str) -> str:
    """Run a tweet through the second cleaning pipeline.

    Stop words are removed first, then the remaining tokens are lemmatized.
    """
    without_stopwords = remove_stopwords(tweet)
    return stem_tweet(without_stopwords)

In [None]:
def augment_data(df):
    """Augment a tweet dataframe with back-translated copies of every row.

    For each row the content is passed through ``translate`` twice (first
    with the arguments ``'en', 'es'``, then with ``'es', 'en'``), which is
    presumably a Spanish -> English -> Spanish round trip; confirm against
    the mtranslate documentation. The result is added as a new row with the
    same ``tweetid`` and ``polarity``.

    Args:
        df: Dataframe with at least the columns ``tweetid``, ``content`` and
            ``polarity``. It is not modified.

    Returns:
        A new dataframe holding the original rows (``augmented`` == 0)
        followed by the translated rows (``augmented`` == 1). When at least
        one row was added the index is reset to 0..n-1.
    """
    new_df = df.copy()
    new_df['augmented'] = 0

    # Collect the new rows and concatenate once at the end: appending to the
    # dataframe inside the loop copies the whole frame on every iteration,
    # and ``DataFrame.append`` no longer exists in pandas 2.
    augmented_rows = []
    for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
        trans_en = translate(translate(row['content'], 'en', 'es'), 'es', 'en')
        augmented_rows.append({'tweetid': row['tweetid'],
                               'content': trans_en,
                               'polarity': row['polarity'],
                               'augmented': 1})

    if not augmented_rows:
        return new_df
    return pd.concat([new_df, pd.DataFrame(augmented_rows)], ignore_index=True)

### Clean and save

In [None]:
# Parse the raw training XML, run the first cleaning pipeline over every
# tweet (with a tqdm progress bar) and cache the result as CSV so the slow
# cleaning step does not have to be repeated.
df = read_data("data/cr/intertass_cr_train.xml")
df['content'] = df['content'].progress_apply(clean_tweet)
df.to_csv("data/cr/cleaned.csv", index=False)

In [None]:
# Reload the cleaned tweets, add the translated copies produced by
# augment_data and cache the enlarged dataset as CSV.
df = pd.read_csv("data/cr/cleaned.csv")
df = augment_data(df)
df.to_csv("data/cr/cleaned_augmented.csv", index=False)

In [None]:
# Reload the augmented tweets, run the second cleaning pipeline (stop-word
# removal and lemmatization) and drop rows whose cleaned content is a
# duplicate, keeping the first occurrence.
df = pd.read_csv("data/cr/cleaned_augmented.csv")
df['content'] = df['content'].progress_apply(clean_tweet2)
df = df.drop_duplicates('content', keep='first')
# index=False matches the other two saves; without it the (now
# non-contiguous) index is written as an extra unnamed column.
df.to_csv("data/cr/cleaned_augmented_complete.csv", index=False)

## Explore data

### Load data from file

In [None]:
# Preview the first rows of the dataframe.
df.head()

In [None]:
# Preview the last rows of the dataframe.
df.tail()

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Count how many tweets carry each polarity label and show the class
# distribution as a bar chart (one bar per label).
polarity_counts = Counter(df.polarity.values)
plotdf = pd.DataFrame.from_dict(polarity_counts, orient='index')
plotdf.plot(kind='bar')

## Classifier

### Turn tweets into features

In [None]:
# Frequency distribution over every token of every tweet in the dataframe.
all_words = nltk.FreqDist([word for tweet in df.content for word in nltk.word_tokenize(tweet)])
# The distinct tokens of the corpus; each one becomes a boolean
# "contains(word)" feature in document_features.
word_features = list(all_words)

def document_features(document):
    """Build the feature dictionary for one tweet.

    The result holds one boolean ``contains(word)`` entry for every word in
    the module-level ``word_features`` list, telling whether that word occurs
    among the tweet's NLTK tokens, plus the three counts ``positive``,
    ``neutral`` and ``negative``: the number of polyglot words in the tweet
    whose ``polarity`` is above, equal to and below zero respectively.
    """
    present = set(nltk.word_tokenize(document))
    features = {
        'contains({})'.format(word): word in present
        for word in word_features
    }

    # The language is set explicitly to Spanish instead of letting polyglot
    # detect it.
    text = Text(document)
    text.language = 'es'
    polarities = [w.polarity for w in text.words]
    features['positive'] = sum(1 for score in polarities if score > 0)
    features['neutral'] = sum(1 for score in polarities if score == 0)
    features['negative'] = sum(1 for score in polarities if score < 0)

    return features
# df_sub is the frame the features are built from; it is currently the full
# dataframe.
df_sub = df
# One (feature dict, polarity label) pair per tweet, the input format the
# NLTK classifier wrapper is trained and evaluated on below.
featuresets = [(document_features(d), c) for (d,c) in zip(df_sub.content, df_sub.polarity)]

In [None]:
# n-fold cross-validation of a logistic-regression classifier on the
# featuresets; prints the mean accuracy over the folds. The folds are
# shuffled without a fixed seed, so the figure varies between runs.
n = 10
kf = KFold(n_splits=n, shuffle=True)
total = 0
for train, test in kf.split(featuresets):
    # Pick the fold's samples straight from the Python list. This avoids
    # rebuilding a NumPy object array of all featuresets twice per fold.
    train_set = [featuresets[i] for i in train]
    test_set = [featuresets[i] for i in test]
    classifier = SklearnClassifier(LogisticRegression()).train(train_set)
    total += nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", total / n)

In [None]:
# Train the final classifier on all featuresets (no held-out fold).
classifier = SklearnClassifier(LogisticRegression()).train(featuresets)

In [None]:
# Load the test set from CSV, build its featuresets with the same extractor
# (and therefore the training vocabulary in word_features) and report the
# accuracy of the final classifier on it.
# NOTE(review): the file name suggests it was already cleaned elsewhere;
# confirm it went through the same pipelines as the training data.
df_test = pd.read_csv("data/cr/test_cleaned.csv")
featuresets_test = [(document_features(d), c) for (d,c) in zip(df_test.content, df_test.polarity)]
nltk.classify.accuracy(classifier, featuresets_test)

In [None]:
# Ask the classifier wrapper to display its 100 most informative features.
classifier.show_most_informative_features(100)