In [1]:
# Show which raw CSV files are available in the local data directory.
!ls ./data/

processedNegative.csv processedNeutral.csv  processedPositive.csv


> Goal: accuracy on the test dataset > 0.873

# Imports
---

In [2]:
import pandas as pd
import string
import re

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

import unidecode
import contractions
# from word2number import w2n

In [3]:
# Read the three pre-processed tweet dumps, one DataFrame per sentiment class.
negative_tw, neutral_tw, positive_tw = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/processedNegative.csv',
        'data/processedNeutral.csv',
        'data/processedPositive.csv',
    )
)

In [4]:
# English stopwords kept in a set so membership checks during tokenization are O(1).
set_of_stopwords = {*stopwords.words('english')}

# Preprocessing
---

In [5]:
def remove_accented_chars(text):
    """
    Replace accented characters with their plain counterparts
    by delegating to ``unidecode.unidecode``.
    Ex.: cafè --> cafe
    """
    plain_text = unidecode.unidecode(text)
    return plain_text

In [6]:
def expand_contractions(text):
    """
    Rewrite contracted forms as their full wording
    by delegating to ``contractions.fix``.
    Ex.: don't --> do not
    """
    expanded_text = contractions.fix(text)
    return expanded_text

In [15]:
def preprocess_text(text, stemming=None, lemmatization=False, misspellings=False):
    """
    Turn a raw text into a list of normalized word tokens.

    Steps, in order: replace accented characters, expand contractions,
    keep only letters and whitespace, lowercase, optionally squeeze
    elongated letter runs, drop English stopwords, optionally lemmatize,
    optionally stem.

    Parameters
    ----------
    text : str
        Raw input text.
    stemming : str or None
        Falsy value disables stemming. 'snowball' (case-insensitive) selects
        SnowballStemmer; any other non-empty string selects PorterStemmer.
    lemmatization : bool
        When True, each word is passed through WordNetLemmatizer.
    misspellings : bool
        When True, runs of three or more identical letters are shortened to
        two (Ex.: soooo --> soo, unhapppy --> unhappy).

    Returns
    -------
    list of str
        The processed words, in their original order.
    """
    # Accents and contractions have to be handled while apostrophes and
    # non-ASCII letters are still in the text; stripping first would turn
    # "don't" into "dont" and "cafè" into "caf", making both helpers no-ops.
    text = expand_contractions(remove_accented_chars(text))

    # removing everything except letters and whitespace (tabs/newlines are
    # kept so that the words around them are not glued together)
    text = re.sub(r'[^A-Za-z\s]', '', text).lower()

    if misspellings:
        # squeeze elongated letters: 3+ repeats of the same letter become 2
        text = re.sub(r'([a-z])\1{2,}', r'\1\1', text)

    # splitting into words and removing stopwords
    words = [w for w in text.split() if w not in set_of_stopwords]

    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]

    if stemming:
        if stemming.lower() == 'snowball':
            stemmer = SnowballStemmer(language='english')
        else:
            stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]

    return words

In [18]:
# Sanity check: digits and punctuation are dropped, plural nouns are lemmatized.
preprocess_text(
    'Riverdale tonight ? 12 (( unhappy sings cared singing feet bats',
    lemmatization=True,
)

['riverdale', 'tonight', 'unhappy', 'sings', 'cared', 'singing', 'foot', 'bat']

In [54]:
# Find letters repeated three or more times in a row. The pattern has to be a
# raw string using the backreference \1: "\0" in a regular string literal is a
# NUL character, so the previous pattern could never match anything.
tmp = re.findall(r"([a-z])\1{2,}", 'unhapppyy atttracted')
tmp

['p', 't']