In [83]:
###################################################################################
# Import modules

import nltk as nltk
import numpy as np
import pandas as pd
import collections as coll

# Hyperparameters for n-grams:
#   - n = 1:4
#   - multimap = [True, False]
#   - low-count cutoff
#   - 

###################################################################################
# Extract n-grams from text snippet
#
# Input:
#   - String s i.e. "okok"
#   - Integer n i.e. 2
# Output:
#   - List<String> i.e. ["ok","ko","ok"]

def string_to_ngrams(s, n = 1):
    """Split a text snippet into its overlapping character n-grams.

    The text is lower-cased and every space character is removed before the
    n-grams are taken, so an n-gram may span what used to be a word boundary.

    Input:
      - s: the text. Byte strings are decoded as UTF-8; any other object is
           converted with str(). None and NaN are treated as missing text.
      - n: Integer length of each n-gram (default 1).
    Output:
      - List<String>, e.g. string_to_ngrams("okok", 2) -> ["ok", "ko", "ok"].
        The list is empty when the text is missing, when the cleaned text is
        shorter than n, or when n < 1.
    """
    # Missing text (None or NaN) must not be turned into the literal strings
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return []
    # Decode only real byte strings. On Python 3 str has no .decode(); on
    # Python 2 str() returns bytes, which the second check then decodes.
    text = s if isinstance(s, bytes) else str(s)
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = text.lower().replace(' ', '') # remove spaces
    if n < 1:
        return []
    # Slide a window of width n over the characters.
    return [text[i:i + n] for i in range(len(text) - n + 1)]

###################################################################################
# Produce new representation of data such that each n-gram is associated with 
# the number of times it is observed in a text of a particular language.
#
# Input:
#   - pd.DataFrame train_set
#   - Boolean multimap: True if an n-gram is counted multiple times per text, and False
#                       if an n-gram is counted only once per text.
# Output:
#   - pd.DataFrame

# I can think of two ways to encode using n-grams.
#
# Method #1: For each language, calculate the # of texts in which each n-gram
#            has appeared.  This means that an n-gram is counted <once> per text.
#
# Method #2: For each language, calculate the # of occurrences of each n-gram across
#            all texts.  This means that an n-gram is counted <multiple times> per text.

def set_to_df(train_set, multimap = True, n = 1, verbose = False, num_categories = 5):
    """Count, for every n-gram, how often it is seen in texts of each category.

    Input:
      - pd.DataFrame train_set: one row per text, with a 'Text' column and an
                                integer 'Category' column that is used as a
                                position into the count vector
      - Boolean multimap: True if an n-gram is counted once per occurrence,
                          False if it is counted at most once per text
      - Integer n: length of the n-grams extracted from each text
      - Boolean verbose: print the running count vector after every update
      - Integer num_categories: length of the count vector (default 5)
    Output:
      - pd.DataFrame with one row per n-gram and one column per category
        position (0 .. num_categories - 1)
    """
    counts = {}
    for _, row in train_set.iterrows():
        label = row['Category']
        # Break the text into n-grams, honouring the requested length.
        ngrams = string_to_ngrams(row['Text'], n = n)
        seen_in_text = set()
        for ngram in ngrams:
            if not multimap:
                # Count each distinct n-gram once per text. Skipping repeats in
                # place keeps first-appearance order, so row order is stable.
                if ngram in seen_in_text:
                    continue
                seen_in_text.add(ngram)
            if ngram not in counts:
                # Every n-gram owns its own vector, so the in-place += below
                # cannot leak into another n-gram's counts.
                counts[ngram] = np.zeros(num_categories, dtype = int)
            counts[ngram][label] += 1
            if verbose:
                print("%s:%s" % (ngram, counts[ngram]))
    # The dict becomes one column per n-gram; transpose to one row per n-gram.
    return pd.DataFrame(counts).transpose()

###################################################################################
# Filter low-count occurrences to remove noise n-grams.
#
# Input:
#   - pd.DataFrame df
#   - Integer count_threshold
# Output:
#   - pd.DataFrame

def filter_df(df, count_threshold):
    """Drop low-count rows to remove noise n-grams.

    Input:
      - pd.DataFrame df: numeric counts, one row per n-gram
      - Integer count_threshold: minimum row total (inclusive) to keep a row
    Output:
      - pd.DataFrame holding the rows of df whose values sum to at least
        count_threshold; df itself is not modified
    """
    # Vectorized row totals instead of a Python-level sum per row.
    # skipna = False keeps the earlier behaviour for rows containing NaN:
    # their total is NaN, the comparison is False, and the row is dropped.
    row_totals = df.sum(axis = 1, skipna = False)
    return df.loc[row_totals >= count_threshold]

###################################################################################
# Normalize occurrences to avoid class imbalances.
#
# Input:
#   - pd.DataFrame df
#   - pd.DataFrame train_set_y
# Output:
#   - pd.DataFrame

def normalize_df(df, train_set_y):
    """Normalize occurrences to avoid class imbalances.

    Each category column of df is divided by the number of rows in
    train_set_y that carry that category.

    Input:
      - pd.DataFrame df: counts with one column per category value
      - pd.DataFrame train_set_y: must have a 'Category' column
    Output:
      - pd.DataFrame: a normalized copy; df itself is left untouched, so
        calling this again on the same input gives the same result
    Raises:
      - KeyError if a category found in train_set_y has no column in df
    """
    totals = coll.Counter(train_set_y['Category'])
    # Work on a copy so the caller's frame is not overwritten.
    normalized = df.copy()
    for colname, total in totals.items():
        # float() forces true division on Python 2 as well.
        normalized[colname] = normalized[colname] / float(total)
    return normalized

In [91]:
# Read the three CSV files, then attach each training text to its label
# by joining the two training tables on their shared 'Id' column.

train_set_x = pd.read_csv("data/train_set_x.csv")
train_set_y = pd.read_csv("data/train_set_y.csv")
test_set_x = pd.read_csv("data/test_set_x.csv")
train_set = train_set_x.merge(train_set_y, on = 'Id')

# Build n-gram count tables from the first 1000 training rows:
# tmt counts every occurrence (only its first 10 rows are kept),
# tmf counts each n-gram at most once per text.

tmt = set_to_df(train_set.iloc[:1000], multimap = True).iloc[:10]
tmf = set_to_df(train_set.iloc[:1000], multimap = False)

In [92]:
# Counts built with multimap = False: each n-gram is counted at most once per text.
tmf

Unnamed: 0,0,1,2,3,4
0,1,17,5,5,0
1,1,38,7,7,1
2,2,27,5,6,0
3,0,21,4,3,0
4,1,10,1,3,0
5,0,28,5,2,0
6,0,21,4,3,0
7,0,24,1,4,0
8,0,12,2,1,0
9,0,21,2,2,0


In [93]:
# First 10 rows of the counts built with multimap = True (every occurrence is counted).
tmt

Unnamed: 0,0,1,2,3,4
0,1,22,7,11,0
1,2,62,7,9,1
2,2,41,6,12,0
3,0,27,6,5,0
4,1,15,3,11,0
5,0,37,6,5,0
6,0,28,4,6,0
7,0,27,1,7,0
8,0,16,3,1,0
9,0,30,3,3,0


In [94]:
# Keep only the rows of tmt (one row per n-gram) whose counts, summed over
# all category columns, reach at least 40.
tmtf = filter_df(tmt, 40)
tmtf

Unnamed: 0,0,1,2,3,4
0,1,22,7,11,0
1,2,62,7,9,1
2,2,41,6,12,0
5,0,37,6,5,0


In [95]:
# Divide each category column by the number of rows in train_set_y that
# carry that category.
# NOTE(review): the name "tmtfn" suggests the filtered frame tmtf was meant to
# be normalized, but the unfiltered tmt is passed here -- confirm which is intended.
tmtfn = normalize_df(tmt, train_set_y)
tmtfn

Unnamed: 0,0,1,2,3,4
0,7.1e-05,0.000156,0.0001,0.000297,0.0
1,0.000141,0.000439,0.0001,0.000243,7.1e-05
2,0.000141,0.00029,8.6e-05,0.000324,0.0
3,0.0,0.000191,8.6e-05,0.000135,0.0
4,7.1e-05,0.000106,4.3e-05,0.000297,0.0
5,0.0,0.000262,8.6e-05,0.000135,0.0
6,0.0,0.000198,5.7e-05,0.000162,0.0
7,0.0,0.000191,1.4e-05,0.000189,0.0
8,0.0,0.000113,4.3e-05,2.7e-05,0.0
9,0.0,0.000212,4.3e-05,8.1e-05,0.0
