In [23]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold

In [2]:
def get_base(dfngram, ngram = 2):
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]
    ASET = set()
    for i in xrange(len(df)):
        ASET.add(tuple(df.iloc[i]))
    
    return ASET

In [3]:
def get_markov(dfngram, ngram = 2):
    
    if ngram == 1:
        base = set(range(1,101))
    else:
        base = get_base(dfngram, ngram = ngram)
        
    xy_table = pd.DataFrame(data = np.zeros([len(base), 101]), index=base, columns=range(1,102))
    x_table = pd.DataFrame(data = np.zeros([len(base), 1]), index=base, columns= ['target'])
    
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]
    target = dfngram['target'].values
    
    for i in range(len(df)):
        if ngram == 1:
            t = df.iloc[i].values[0]
        else:
            t = tuple(df.iloc[i])
            
        s = target[i]
        xy_table.ix[t,s] += 1
        x_table.ix[t,"target"] += 1
    
    prob_value = np.array(xy_table)/np.array(x_table)
    p_table = pd.DataFrame(data = prob_value, index=base, columns=range(1,102))
    
    p_table["base"] = p_table.index
    p_xy = p_table[range(1,102)].values
    p_y = np.array(range(1,102))
    p_table['pred'] = np.dot(p_xy,p_y)
    
    return p_table, xy_table, x_table, base
    

In [4]:
def get_mse_7(dfngram, p_table7, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1, \
              base7, base6, base5, base4, base3, base2, base1):
    
    ngram = 7
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_7(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))


In [5]:
def get_mse_6(dfngram, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,\
              base6, base5, base4, base3, base2, base1):
    
    ngram = 6
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_6(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))


In [6]:
def get_mse_5(dfngram, p_table5, p_table4, p_table3, p_table2, p_table1,\
              base5, base4, base3, base2, base1):
    
    ngram = 5
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_5(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))

In [7]:
def get_mse_4(dfngram, p_table4, p_table3, p_table2, p_table1, base4, base3, base2, base1):
    
    ngram = 4
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_4(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))

In [8]:
def get_mse_3(dfngram, p_table3, p_table2, p_table1, base3, base2, base1):
    
    ngram = 3
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_3(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))

In [9]:
def get_mse_2(dfngram, p_table2, p_table1, base2, base1):
    
    ngram = 2
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_2(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))

In [10]:
def get_mse_1(dfngram, p_table1, base1):
    
    ngram = 1
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    df = dfngram[col]

    est_list = []
    diff_list = []
    for i in range(len(df)):
        basei = tuple(df.iloc[i])
        esti = get_est_1(basei)
        est_list.append(esti)
        diff_list.append(dfngram['target'][i] - esti)
    
    dfngram['est'] = est_list
    dfngram['diff'] = diff_list
    mse_list = np.array(diff_list)
    
    return np.sqrt(np.mean(mse_list**2))

In [11]:
def get_est_7(basei):
    
    if basei in base7:
        return p_table7[p_table7['base']==basei]['pred'].values[0]
    else:
        if basei[1:] in base6:
            return p_table6[p_table6['base']==basei[1:]]['pred'].values[0]
        else:
            if basei[2:] in base5:
                return p_table5[p_table5['base']==basei[2:]]['pred'].values[0]
            else: 
                if basei[3:] in base4:
                    return p_table4[p_table4['base']==basei[3:]]['pred'].values[0]
                else:
                    if basei[4:] in base3:
                        return p_table3[p_table3['base']==basei[4:]]['pred'].values[0]
                    else:
                        if basei[5:] in base2:
                            return p_table2[p_table2['base']==basei[5:]]['pred'].values[0]
                        else:
                            return p_table1[p_table1['base']==basei[6]]['pred'].values[0]

In [12]:
def get_est_6(basei):
    
    if basei in base6:
        return p_table6[p_table6['base']==basei]['pred'].values[0]
    else:
        if basei[1:] in base5:
            return p_table5[p_table5['base']==basei[1:]]['pred'].values[0]
        else:
            if basei[2:] in base4:
                return p_table4[p_table4['base']==basei[2:]]['pred'].values[0]
            else: 
                if basei[3:] in base3:
                    return p_table3[p_table3['base']==basei[3:]]['pred'].values[0]
                else:
                    if basei[4:] in base2:
                        return p_table2[p_table2['base']==basei[4:]]['pred'].values[0]
                    else:
                        return p_table1[p_table1['base']==basei[5]]['pred'].values[0]

In [13]:
def get_est_5(basei):
    
    if basei in base5:
        return p_table5[p_table5['base']==basei]['pred'].values[0]
    else:
        if basei[1:] in base4:
            return p_table4[p_table4['base']==basei[1:]]['pred'].values[0]
        else:
            if basei[2:] in base3:
                return p_table3[p_table3['base']==basei[2:]]['pred'].values[0]
            else: 
                if basei[3:] in base2:
                    return p_table2[p_table2['base']==basei[3:]]['pred'].values[0]
                else:
                    return p_table1[p_table1['base']==basei[4]]['pred'].values[0]

In [14]:
def get_est_4(basei):
    
    if basei in base4:
        return p_table4[p_table4['base']==basei]['pred'].values[0]
    else:
        if basei[1:] in base3:
            return p_table3[p_table3['base']==basei[1:]]['pred'].values[0]
        else:
            if basei[2:] in base2:
                return p_table2[p_table2['base']==basei[2:]]['pred'].values[0]
            else: 
                return p_table1[p_table1['base']==basei[3]]['pred'].values[0]

In [15]:
def get_est_3(basei):
    
    if basei in base3:
        return p_table3[p_table3['base']==basei]['pred'].values[0]
    else:
        if basei[1:] in base2:
            return p_table2[p_table2['base']==basei[1:]]['pred'].values[0]
        else:
            return p_table1[p_table1['base']==basei[2]]['pred'].values[0]

In [16]:
def get_est_2(basei):
    
    if basei in base2:
        return p_table2[p_table2['base']==basei]['pred'].values[0]
    else:
        return p_table1[p_table1['base']==basei[1]]['pred'].values[0]

In [17]:
def get_est_1(basei):
    
    basei = basei[0]
    return p_table1[p_table1['base']==basei]['pred'].values[0]
    

In [24]:
dfngram1 = pd.read_csv('../data/small_101_1.csv')
dfngram2 = pd.read_csv('../data/small_101_2.csv')
dfngram3 = pd.read_csv('../data/small_101_3.csv')
dfngram4 = pd.read_csv('../data/small_101_4.csv')
dfngram5 = pd.read_csv('../data/small_101_5.csv')
dfngram6 = pd.read_csv('../data/small_101_6.csv')
dfngram7 = pd.read_csv('../data/small_101_7.csv')

In [None]:
'''
dfngram1 = pd.read_csv('../data/ngram101_1.csv')
dfngram2 = pd.read_csv('../data/ngram101_2.csv')
dfngram3 = pd.read_csv('../data/ngram101_3.csv')
dfngram4 = pd.read_csv('../data/ngram101_4.csv')
dfngram5 = pd.read_csv('../data/ngram101_5.csv')
dfngram6 = pd.read_csv('../data/ngram101_6.csv')
dfngram7 = pd.read_csv('../data/ngram101_7.csv')
'''

In [25]:
p_table1, xy_table1, x_table1, base1 = get_markov(dfngram1, ngram = 1)
p_table2, xy_table2, x_table2, base2 = get_markov(dfngram2, ngram = 2)
p_table3, xy_table3, x_table3, base3 = get_markov(dfngram3, ngram = 3)
p_table4, xy_table4, x_table4, base4 = get_markov(dfngram4, ngram = 4)
p_table5, xy_table5, x_table5, base5 = get_markov(dfngram5, ngram = 5)
p_table6, xy_table6, x_table6, base6 = get_markov(dfngram6, ngram = 6)
p_table7, xy_table7, x_table7, base7 = get_markov(dfngram7, ngram = 7)

In [22]:
mse7 = get_mse_7(dfngram7, p_table7, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,\
          base7, base6, base5, base4, base3, base2, base1
mse6 = get_mse_6(dfngram6, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,\
          base6, base5, base4, base3, base2, base1)
mse5 = get_mse_5(dfngram5, p_table5, p_table4, p_table3, p_table2, p_table1,\
                 base5, base4, base3, base2, base1)
mse4 = get_mse_4(dfngram4, p_table4, p_table3, p_table2, p_table1,\
                 base4, base3, base2, base1)
mse3 = get_mse_3(dfngram3, p_table3, p_table2, p_table1,\
                 base3, base2, base1)
mse2 = get_mse_2(dfngram2, p_table2, p_table1,\
                 base2, base1)
mse1 = get_mse_1(dfngram1, p_table1, base1)

print "mse7 : {}".format(mse7)
print "mse6 : {}".format(mse6)
print "mse5 : {}".format(mse5)
print "mse4 : {}".format(mse4)
print "mse3 : {}".format(mse3)
print "mse2 : {}".format(mse2)
print "mse1 : {}".format(mse1)


mse7 : 0.0
mse6 : 0.0
mse5 : 0.0
mse4 : 0.14239633969
mse3 : 0.931827747425
mse2 : 4.5249658132
mse1 : 11.497620338


'\nmse6 : 0.367626053399\nmse5 : 0.844756406342\nmse4 : 2.45499298668\nmse3 : 6.26906420524\nmse2 : 8.92988218392\nmse1 : 11.2127767336\n'

In [33]:
def get_cv_7(dfngram):
    
    num_kfold = 5

    kf = KFold(len(dfngram), num_kfold)

    list_mse = []
    for train_index, test_index in kf:
        dfngram_train = dfngram.iloc[train_index]
        dfngram_test = dfngram.iloc[test_index]
        p_table, xy_table, x_table, base = get_markov(dfngram_train, ngram = 7)
        
        mse = get_mse_7(dfngram_test, p_table, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,\
          base, base6, base5, base4, base3, base2, base1)
        
        list_mse.append(mse)
                        
    return np.mean(np.array(list_mse))
  

In [34]:
get_cv_7(dfngram7)

KeyError: 0