In [2]:
# Import modules

import nltk as nltk
import numpy as np
import pandas as pd

# Read the x and y training files, join them row-wise on their shared 'Id'
# column, and read the x file of the test set.

train_set_x = pd.read_csv("data/train_set_x.csv")
train_set_y = pd.read_csv("data/train_set_y.csv")
train_set = train_set_x.merge(train_set_y, on = 'Id')
test_set_x = pd.read_csv("data/test_set_x.csv")

# Extract character n-grams from a text snippet.
#
# The text is lower-cased and its spaces are removed; every run of n
# consecutive characters is then returned. The window slides one character
# at a time, so neighbouring n-grams overlap.
#
# Input:
#   - s: the text i.e. "okok". Byte strings are decoded as UTF-8; a value
#     that is not a string is converted with str() first.
#   - Integer n i.e. 2 (size of each n-gram, default 1)
# Output:
#   - List<String> i.e. ["ok","ko","ok"]
#     Empty when the cleaned text is shorter than n, or when n < 1.

def string_to_ngrams(s, n = 1):
    # Only byte strings have .decode(): calling it on str(s) fails on
    # Python 3, and str() of non-ASCII unicode fails on Python 2. Decode
    # bytes only and leave text that is already unicode untouched.
    if not isinstance(s, (bytes, type(u''))):
        # NOTE(review): a missing value (float NaN) is turned into the text
        # "nan" here, exactly as before -- confirm that this is intended.
        s = str(s)
    if isinstance(s, bytes):
        s = s.decode('utf-8')

    text = s.lower().replace(' ', '') # remove spaces

    # There is no n-gram of non-positive size.
    if n < 1:
        return []

    # Each slice is one window of n consecutive characters; the range is
    # empty when the text is shorter than n.
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Produce a new representation of the data in which each n-gram is associated
# with the number of times it is observed in texts of each language.
#
# Input:
#   - pd.DataFrame train_set with a 'Text' column and a 'Category' column
#     holding the integer language code of each text, in [0, n_categories)
#   - Integer n: n-gram size handed to string_to_ngrams (default 1)
#   - Integer n_categories: number of distinct language codes (default 5)
# Output:
#   - pd.DataFrame with one row per n-gram and one column per language code;
#     each cell is the number of occurrences of that n-gram in that language
# Raises:
#   - IndexError if a 'Category' value lies outside [0, n_categories)

def set_to_df(train_set, n = 1, n_categories = 5):
    counts = {}

    # Only two columns are needed, so walk them in lockstep instead of
    # materialising every row with iterrows().
    for text, label in zip(train_set['Text'], train_set['Category']):
        # Reject bad codes up front: a negative index would otherwise wrap
        # around and be silently counted under the wrong language.
        if not 0 <= label < n_categories:
            raise IndexError("Category %r is outside [0, %d)"
                             % (label, n_categories))

        for ngram in string_to_ngrams(text, n):
            # One private counter array per n-gram, so incrementing in place
            # can never touch another n-gram's counts.
            if ngram not in counts:
                counts[ngram] = np.zeros(n_categories, dtype = int)
            counts[ngram][label] += 1

    # The dict gives one column per n-gram; transpose so n-grams become rows.
    return pd.DataFrame(counts).transpose()

# Show the table of per-language n-gram counts for the training set.
print(set_to_df(train_set))

        0       1       2       3      4
0     801    9994    4418    2283     33
1    1059   12882    3845    2662     29
2     942   10890    2972    1828     24
3     621    6850    2056    1167     18
4     426    5483    1566     623      8
5     503    7975    1812     832     19
6     353    7510    1520     931      8
7     322    9712    1421     639      6
8     300    4846    1340     427      5
9     209    7238    1493     558      4
a   68894  432046  276233  106409  42319
b   11096   71123   34663   29862   8654
c   13787  199831   97768   58828  18373
d   21160  168355   98984   71359  14894
e   57971  774177  326705  212411  39926
f    1278   65541   19328   20842    702
g    1291   50116   38467   35630   5510
h   10248   69899   41386   88287   4765
i   34899  396859  139463  129529  40568
j   15855   84473   15873    6655  13098
k   24474   11662    6736   20797  13144
l   22004  256817  114600   62402   8050
m   30672  173593   77376   45156  17927
n   33483  37517