# First look at the data
* Text data are difficult to deal with because they are unstructured
* Reddit data is particularly dirty (lots of non-ASCII characters, stray punctuation and incomplete sentences)

Here we have the raw data extracted by the scraper, along with the labels of the categories we assigned to them.

In [4]:
import pandas as pd
import re
import unicodedata
import inflect
import contractions
import string
from sklearn.base import BaseEstimator, TransformerMixin
import spacy
# Load spaCy's small English pipeline into the module-level `nlp` object.
# NOTE: the same model is loaded again in a later cell, right before the
# stop-word list is adjusted, so this first load is redundant.
nlp = spacy.load('en_core_web_sm')

In [109]:
# Relative path of the CSV holding the scraped posts together with their labels.
file = '../datasets/reddit_raw_with_labels.csv'

In [118]:
class ValidTextCreator(BaseEstimator, TransformerMixin):
    """Select labelled rows and build the ``text`` column used downstream.

    ``transform`` drops rows with missing values in the configured columns,
    keeps only rows whose ``label`` is one of ``LABELS``, joins ``title`` and
    ``body`` into a single ``text`` column and returns the ``VALID_COLS``
    columns.

    Parameters
    ----------
    cols_to_drop_na : str, list or None, default=None
        Column name(s) in which a missing value causes the row to be dropped.
        A single name is wrapped into a one-element list; ``None`` disables
        the missing-value filter.
    """

    # Labels kept by the transformer; rows carrying any other label are removed.
    LABELS = ['other', 'screeners', 'bad test', 'ratings', 'recorder', 'live convo', 'no test', 'mobile', 'bug', 'payment']
    # Columns (in this order) of the frame returned by ``transform``.
    VALID_COLS = ['title', 'score', 'num_comments', 'created_at', 'text', 'label']

    def __init__(self, cols_to_drop_na=None):
        # Keep the constructor argument under its own name so that
        # BaseEstimator.get_params() / repr() / clone() can look it up.
        self.cols_to_drop_na = cols_to_drop_na
        if cols_to_drop_na is None:
            # No columns requested: the NA filter becomes a no-op instead of
            # failing later on ``dropna(subset=[None])``.
            self.cols = []
        elif isinstance(cols_to_drop_na, list):
            self.cols = cols_to_drop_na
        else:
            self.cols = [cols_to_drop_na]

    def _drop_na(self, X):
        """Return ``X`` without the rows that have NA in any configured column."""
        if not self.cols:
            return X
        return X.dropna(subset=self.cols)

    def _get_valid_labels_only(self, X):
        """Return the rows of ``X`` whose ``label`` is listed in ``LABELS``."""
        return X[X['label'].isin(self.LABELS)]

    def _concatenate_title_body(self, X):
        """Return ``X`` with a ``text`` column equal to ``title + ' ' + body``.

        Missing titles or bodies are treated as empty strings. Both columns
        are read from ``X`` itself (not from any notebook-level frame), and
        ``assign`` is used so the filtered frame is never written to in place.
        """
        text = X['title'].fillna('') + ' ' + X['body'].fillna('')
        return X.assign(text=text)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the filtered frame restricted to ``VALID_COLS``.

        Every step returns a new frame, so the caller's ``X`` is not mutated.
        """
        X = self._drop_na(X)
        X = self._get_valid_labels_only(X)
        X = self._concatenate_title_body(X)
        return X[self.VALID_COLS]

In [119]:
# Load the raw labelled posts. NOTE: the name `df` is rebound further down to
# the cleaned and then the tokenized frame, so re-running cells out of order
# changes what `df` refers to.
df = pd.read_csv(file)

In [120]:
# Rows whose `label` is missing will be dropped by this transformer.
creator = ValidTextCreator(cols_to_drop_na='label')

In [122]:
# transform() is called directly: the transformer's fit() learns nothing.
cleaned_df = creator.transform(df)

Let's use the `label` column, which takes the top 100 labels of the `majority_vote` and fills the rest with `category_sl`.

In [124]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """Character-level clean-up of one or more text columns.

    For every configured column each value has its http(s) links removed,
    every character other than a word character, apostrophe, dollar sign or
    space replaced by a space, its contractions expanded, and is lower-cased.

    Parameters
    ----------
    variables : str or list, default=None
        Column name(s) to clean. Anything that is not a list is wrapped into
        a one-element list.
    """

    # Compiled once at class creation; same patterns and flags as the
    # per-call ``re.sub`` form.
    _LINK_PATTERN = re.compile(r'https?://\S+', flags=re.MULTILINE)
    _NOISE_PATTERN = re.compile(r'[^\w\'\$ ]', flags=re.MULTILINE)

    def __init__(self, variables=None):
        self.variables = variables if isinstance(variables, list) else [variables]

    def _remove_https_links(self, text):
        """Delete every ``http://`` / ``https://`` link from ``text``."""
        return self._LINK_PATTERN.sub('', text)

    def _replace_non_alphanumeric(self, text):
        """Replace each character outside ``\\w``, ``'``, ``$`` and space with a space."""
        return self._NOISE_PATTERN.sub(' ', text)

    def _denoise_text(self, text):
        """Strip links first, then blank out the remaining noise characters."""
        without_links = self._remove_https_links(text)
        return self._replace_non_alphanumeric(without_links)

    def _replace_contractions(self, text):
        """Expand contractions using the ``contractions`` package."""
        return contractions.fix(text)

    def _normalize(self, text):
        """Run the full clean-up on a single string and lower-case the result."""
        # Apostrophes survive the denoise step, so contractions are still
        # intact when they reach the expansion step.
        denoised = self._denoise_text(text)
        expanded = self._replace_contractions(denoised)
        return expanded.lower()

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column cleaned."""
        result = X.copy()
        for column in self.variables:
            result[column] = result[column].apply(self._normalize)
        return result

In [125]:
# A bare column name is accepted: the constructor wraps it into a list.
cleaner = TextCleaner('text')

In [127]:
# Rebinds `df`: from here on it is the cleaned frame, not the raw CSV.
df = cleaner.transform(cleaned_df)

In [129]:
# Words that must NOT be treated as stop words (kept in the tokenized text).
STOPWORD_EXCEPTIONS = [
    "whatever",
    "whenever",
    "about",
    "nothing",
    "empty",
    "none",
    "more",
    "somewhere",
    "most",
    "not",
    "never",
]

In [130]:
import spacy    
# Reload the small English pipeline, replacing the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")
# Remove the exception words from the language defaults' stop-word set.
# NOTE(review): this changes the defaults after the pipeline was loaded;
# whether `token.is_stop` picks it up for words the vocab already knows
# depends on the spaCy version — confirm, e.g. with `nlp.vocab["not"].is_stop`.
nlp.Defaults.stop_words -= set(STOPWORD_EXCEPTIONS)

In [131]:
class TextTokenizer(BaseEstimator, TransformerMixin):
    """Token-level normalisation of one or more text columns.

    Each value is run through the module-level spaCy pipeline ``nlp``,
    reduced to the lemmas of its non-stop-word tokens, stripped of non-ASCII
    characters, has all-digit tokens spelled out in words, and is joined
    back into a single space-separated string.

    Parameters
    ----------
    variables : str or list, default=None
        Column name(s) to process. Anything that is not a list is wrapped
        into a one-element list.
    """

    def __init__(self, variables=None):
        self.variables = variables if isinstance(variables, list) else [variables]

    def _lemmatize_and_remove_stop_words(self, text):
        """Return the lemmas of the non-stop-word tokens longer than one character."""
        lemmas = []
        for token in nlp(text):
            if token.is_stop:
                continue
            if len(token.lemma_) > 1:
                lemmas.append(token.lemma_)
        return lemmas

    def _remove_non_ascii(self, words):
        """Remove non-ASCII characters from list of tokenized words."""
        # NFKD splits accented characters into base + combining mark, so the
        # ASCII base letter survives the lossy encode below.
        return [
            unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            for token in words
        ]

    def _replace_numbers(self, words):
        """Replace all integer occurrences in list of tokenized words with textual representation"""
        engine = inflect.engine()
        return [
            engine.number_to_words(token) if token.isdigit() else token
            for token in words
        ]

    def _normalize(self, text):
        """Tokenize and normalise one string, returning the tokens joined by spaces."""
        tokens = self._replace_numbers(
            self._remove_non_ascii(
                self._lemmatize_and_remove_stop_words(text)
            )
        )
        return ' '.join(tokens)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column normalised."""
        result = X.copy()
        for column in self.variables:
            result[column] = result[column].apply(self._normalize)
        return result

In [134]:
class DropUnnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns from a DataFrame.

    Parameters
    ----------
    variables_to_drop : label, list of labels or None, default=None
        Column label(s) removed by ``transform``. With ``None`` nothing is
        dropped and ``transform`` returns an unchanged copy.
    """

    def __init__(self, variables_to_drop=None):
        # Stored under the constructor's own name so that
        # BaseEstimator.get_params() / repr() / clone() can look it up.
        self.variables_to_drop = variables_to_drop
        # Kept for code that reads the attribute under its previous name.
        self.variables = variables_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns.

        Raises
        ------
        KeyError
            If a configured column is not present in ``X`` (raised by
            ``DataFrame.drop``).
        """
        if self.variables is None:
            return X.copy()
        # Previously `self` was passed as the labels argument, which made
        # every call fail; `drop` already returns a new frame.
        return X.drop(columns=self.variables)

In [132]:
# A bare column name is accepted: the constructor wraps it into a list.
tokenizer = TextTokenizer('text')

In [133]:
# Rebinds `df` again: its `text` column now holds space-joined lemmas.
df = tokenizer.transform(df)

In [143]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [144]:
# Word-level TF-IDF over unigrams and bigrams, capped at 1500 features.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=1500)

In [150]:
# Fit the vectorizer on the training texts only and vectorize them.
# NOTE: `train_df` is created by the split cell that appears further down in
# this export; that cell has to be run first.
X_train = vectorizer.fit_transform(train_df['text'])

In [155]:
# transform() only: the test texts reuse what was fitted on the training set.
X_test = vectorizer.transform(test_df['text'])

In [146]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
# Random forest with 100 trees; the fixed seed makes runs repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [153]:
# Train on the TF-IDF features against the string labels.
rf_model.fit(X_train, train_df['label'])

RandomForestClassifier(random_state=42)

In [157]:
# Predicted label for every held-out post.
y_pred = rf_model.predict(X_test)

In [158]:
# Ground-truth labels of the held-out posts, in the same row order as X_test.
y_true = test_df['label']

In [159]:
from sklearn.metrics import classification_report

In [161]:
# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

    bad test       0.40      0.12      0.19        16
         bug       0.00      0.00      0.00         4
  live convo       1.00      0.82      0.90        11
      mobile       0.80      0.44      0.57         9
     no test       0.00      0.00      0.00         3
       other       0.36      0.91      0.52        23
     payment       0.00      0.00      0.00         5
     ratings       0.80      0.73      0.76        11
    recorder       0.62      0.45      0.53        11
   screeners       1.00      0.67      0.80         6

    accuracy                           0.54        99
   macro avg       0.50      0.41      0.43        99
weighted avg       0.55      0.54      0.49        99



  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# Hold-out size: 20% of the rows, truncated to an integer by int().
n_test = int(len(df)*0.2)

In [138]:
# Positional split: the first `n_test` rows become the test set, the rest the
# training set. NOTE(review): there is no shuffling or stratification here, so
# the split depends entirely on the row order of `df` — confirm this is intended.
train_df, test_df = df.iloc[n_test:, :], df.iloc[:n_test, :]

In [None]:
# Column names collected for dropping — presumably meant to be passed to
# DropUnnecessaryFeatures; no cell in this notebook uses the list yet.
variables_to_drop = [
    'title',
    'score',
    'num_comments',
    'created_at',
    'text',
]

In [87]:
# Number of training rows per label (shows the class imbalance).
train_df['label'].value_counts()

other            121
screeners         46
bad test          39
ratings           32
recorder          31
live convo        30
no test           29
mobile            28
bug               25
payment           15
other company      1
Name: label, dtype: int64