## Load needed things

### Import packages

In [8]:
import numpy as np
import nltk
import pandas as pd
import xml.etree.ElementTree as ET
from collections import namedtuple
from os import listdir
from os.path import isfile, join
import re
import string
from typing import Callable, List
import unidecode
from spellchecker import SpellChecker
import urllib.request
import json
from tqdm import tqdm
tqdm.pandas()

### Define named tuple

In [2]:
tweet = namedtuple('Tweet', ['tweetid', 'content', 'polarity'])

### Define stopwords

In [9]:
stopWords = set(nltk.corpus.stopwords.words('spanish'))

## Preprocessing

### Read data

In [3]:
def read_data(file: str) -> pd.DataFrame:
    """Read data from given file and return it as a dataframe."""
    tweets: List = []
    with open(file, 'r') as f:
        tree = ET.parse(file)
        root = tree.getroot()
        for child in root:
            tweets.append(tweet(child[0].text, child[2].text, child[5][0][0].text))
    return pd.DataFrame(tweets)

def read_folder(folder: str) -> pd.DataFrame:
    """
    Read data from given folder, combines the training and dev set
    and return them combined as a dataframe.
    """
    dataframes = []
    files = [f for f in listdir(folder) if isfile(join(folder, f))]
    for file in files:
        if 'xml' in file:
            dataframes.append(read_data(folder + file))
    return pd.concat(dataframes)

### String manipulation

In [4]:
def remove_mention(tweet: str) -> str:
    return re.sub(r'@[A-Za-z0-9]+', '', tweet) 

def lower_case(tweet: str) -> str:
    """Turn a tweet to lower case."""
    return tweet.lower()

def remove_question_mark(tweet: str) -> str:
    """Remove spanish question mark from a tweet."""
    return tweet.replace('¿', '')

def remove_punctuation(tweet: str) -> str:
    """Remove punctuation from a tweet."""
    return tweet.translate(str.maketrans('', '', string.punctuation))

def remove_accents(tweet: str) -> str:
    """Remove accents from a tweet."""
    return unidecode.unidecode(tweet)

def remove_whitespace(tweet: str) -> str:
    return tweet.strip()

def clean_tweet(tweet: str) -> str:
    """Run a tweet through cleaning pipeline."""
    # List of function
    functions: List[Callable] = [
                 remove_mention,
                 lower_case,
                 remove_question_mark,
                 remove_punctuation,
                 remove_accents,
                 remove_whitespace
                 ]
    for f in functions:
        tweet = f(tweet)
        
    return tweet

### Spelling

In [17]:
spell = SpellChecker(language='es')

def fix_spelling(tweet: str) -> str:
    """Fix spelling error in tweets."""
    tokenized = nltk.word_tokenize(tweet)
    misspelled = spell.unknown(tokenized)
    for i in range(len(tokenized)):
        if tokenized[i] in misspelled:
            contents = urllib.request.urlopen(f"http://api.urbandictionary.com/v0/define?term={tokenized[i]}").read()
            contents = json.loads(contents)['list']
            
            # Check if the word is spelling error or is in urban dictionary
            if contents and contents[0]['word'].lower().strip() == tokenized[i]:
                continue
            tokenized[i] = spell.correction(tokenized[i])
    return ' '.join(tokenized)

def remove_stopwords(tweet: str) -> str:
    """Remove stopwords from tweet."""
    tokenized = nltk.word_tokenize(tweet)
    return ' '.join([word for word in tokenized if word not in stopWords])

def check_int(s):
    if s[0] in ('-', '+'):
        return s[1:].isdigit()
    return s.isdigit()

def remove_numbers(tweet: str) -> str:
    """Remove numbers from tweet."""
    tokenized = nltk.word_tokenize(tweet)
    return ' '.join([word for word in tokenized if not check_int(word)])

def clean_tweet2(tweet: str) -> str:
    """Run a tweet through cleaning pipeline."""
    # List of function
    functions: List[Callable] = [
                 remove_stopwords,
                 remove_numbers
                 ]
    for f in functions:
        tweet = f(tweet)
        
    return tweet

### Clean and save

In [18]:
df = read_folder("data/cr/")
df['content'] = df['content'].progress_apply(clean_tweet)
df['content'] = df['content'].progress_apply(clean_tweet2)
# df['content'] = df['content'].progress_apply(fix_spelling)
# df.to_csv(data/cr/cleaned.csv)

100%|██████████| 1167/1167 [00:00<00:00, 34358.55it/s]
100%|██████████| 1167/1167 [00:00<00:00, 4468.49it/s]


In [19]:
df

Unnamed: 0,tweetid,content,polarity
0,768225400254111744,totalmente puntual,NONE
1,770077064833671168,hola sandrita habia deseado feliz dia madre ta...,P
2,771207534342320128,si andan haciendo mejor quedaran calladitas ja...,N
3,771900763987513345,pereza quiero choco banano,N
4,772550560998301697,bueno mayor cuanto campo usted sos cartaguito ...,N
5,772594807357124608,pase dia buscando baby lips acabo encontrar me...,NEU
6,772264329433509888,halfon germinal ve mortal mary jo bang volando...,P
7,771140523562143744,ahorita van cambiar ligas brakets alfajores ar...,NONE
8,773525753082187776,amor paciente bondadoso envidioso orgulloso eg...,P
9,774595982600253441,amanecer respirando o2 puro mas regalo mas si ...,P
