In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def get_base(dfngram, ngram = 2):
    """Collect the distinct contexts that occur in an n-gram table.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Must contain the columns ``diff_0`` .. ``diff_{ngram-1}``. Every
        column of the frame is cast to ``int`` (on a copy) before the
        context columns are selected.
    ngram : int, optional
        Number of leading ``diff_*`` columns that make up one context.

    Returns
    -------
    set of tuple
        One ``ngram``-length tuple of ints per distinct row; empty when the
        frame has no rows.
    """
    col = ["diff_{}".format(i) for i in range(ngram)]
    # One pass over the underlying array instead of building a Series per row.
    rows = dfngram.astype('int')[col].values.tolist()
    return set(tuple(row) for row in rows)

In [3]:
def get_markov(dfngram, ngram = 2):
    """Fit a count-based conditional distribution of ``target`` given a context.

    Each row of ``dfngram`` is one observation: its context is the values of
    ``diff_0`` .. ``diff_{ngram-1}`` and its outcome is ``target``. For every
    context the relative frequency of each outcome 1..101 is computed,
    together with the expected outcome under that distribution.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs the ``diff_*`` columns above and a ``target`` column. All
        columns are cast to ``int`` on a copy; the caller's frame is not
        modified.
    ngram : int, optional
        Context length. For ``ngram == 1`` a context is a plain integer and
        the set of contexts is fixed to 1..100; otherwise contexts are tuples
        and the set is whatever occurs in ``dfngram``.

    Returns
    -------
    p_table : pandas.DataFrame
        One row per context, indexed by context. Columns ``1`` .. ``101``
        hold the relative frequencies, ``'base'`` repeats the context and
        ``'pred'`` is the expected target. A context without observations
        (only possible for ``ngram == 1``) has NaN in every numeric column.
    base : set
        The contexts that index ``p_table``.

    Raises
    ------
    KeyError
        If a target lies outside 1..101 or, for ``ngram == 1``, a context
        lies outside 1..100.
    """
    outcomes = np.arange(1, 102)

    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values

    if ngram == 1:
        keys = contexts[:, 0].tolist()
        base = set(range(1,101))
    else:
        keys = [tuple(row) for row in contexts.tolist()]
        # Same set a separate scan of the frame would produce, without the second pass.
        base = set(keys)

    unknown = set(keys) - base
    if unknown:
        raise KeyError("context(s) not in base: {}".format(sorted(unknown)))
    # Guard explicitly: a target of 0 would otherwise wrap around to the last column.
    if target.size and (target.min() < outcomes[0] or target.max() > outcomes[-1]):
        raise KeyError("target outside {}..{}".format(outcomes[0], outcomes[-1]))

    # Freeze one iteration order of the set so rows and index labels line up.
    base_list = list(base)
    row_of = dict((context, pos) for pos, context in enumerate(base_list))
    rows = np.array([row_of[key] for key in keys], dtype=int)

    # counts[r, c] = number of observations with context r and target c + 1.
    counts = np.zeros((len(base_list), len(outcomes)))
    np.add.at(counts, (rows, target - outcomes[0]), 1)
    totals = counts.sum(axis=1, keepdims=True)

    # 0/0 for never-observed contexts is intentional and yields NaN; silence the warning only.
    with np.errstate(divide='ignore', invalid='ignore'):
        prob_value = counts / totals

    p_table = pd.DataFrame(data = prob_value, index=base_list, columns=outcomes)
    p_table["base"] = p_table.index
    # Expected target: sum over outcomes of P(outcome | context) * outcome.
    p_table['pred'] = np.dot(prob_value, outcomes)

    return p_table, base
    

In [4]:
def get_mse_7(dfngram, p_table7, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
              base7, base6, base5, base4, base3, base2, base1):
    """Root-mean-square error of ``get_est_7`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 7-value context (``diff_0`` ..
    ``diff_6``) and a ``target``. The context is handed to ``get_est_7``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` .. ``diff_6`` and ``target``. All columns are cast
        to ``int`` on a copy; the caller's frame is left untouched.
    p_table7 .. p_table1, base7 .. base1
        Forwarded unchanged to ``get_est_7``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 7
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_7(tuple(row), p_table7, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
                  base7, base6, base5, base4, base3, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))


In [5]:
def get_mse_6(dfngram, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
              base6, base5, base4, base3, base2, base1):
    """Root-mean-square error of ``get_est_6`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 6-value context (``diff_0`` ..
    ``diff_5``) and a ``target``. The context is handed to ``get_est_6``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` .. ``diff_5`` and ``target``. All columns are cast
        to ``int`` on a copy; the caller's frame is left untouched.
    p_table6 .. p_table1, base6 .. base1
        Forwarded unchanged to ``get_est_6``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 6
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_6(tuple(row), p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
                  base6, base5, base4, base3, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))


In [6]:
def get_mse_5(dfngram, p_table5, p_table4, p_table3, p_table2, p_table1,
              base5, base4, base3, base2, base1):
    """Root-mean-square error of ``get_est_5`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 5-value context (``diff_0`` ..
    ``diff_4``) and a ``target``. The context is handed to ``get_est_5``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` .. ``diff_4`` and ``target``. All columns are cast
        to ``int`` on a copy; the caller's frame is left untouched.
    p_table5 .. p_table1, base5 .. base1
        Forwarded unchanged to ``get_est_5``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 5
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_5(tuple(row), p_table5, p_table4, p_table3, p_table2, p_table1,
                  base5, base4, base3, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))

In [7]:
def get_mse_4(dfngram, p_table4, p_table3, p_table2, p_table1, base4, base3, base2, base1):
    """Root-mean-square error of ``get_est_4`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 4-value context (``diff_0`` ..
    ``diff_3``) and a ``target``. The context is handed to ``get_est_4``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` .. ``diff_3`` and ``target``. All columns are cast
        to ``int`` on a copy; the caller's frame is left untouched.
    p_table4 .. p_table1, base4 .. base1
        Forwarded unchanged to ``get_est_4``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 4
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_4(tuple(row), p_table4, p_table3, p_table2, p_table1, base4, base3, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))

In [8]:
def get_mse_3(dfngram, p_table3, p_table2, p_table1, base3, base2, base1):
    """Root-mean-square error of ``get_est_3`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 3-value context (``diff_0`` ..
    ``diff_2``) and a ``target``. The context is handed to ``get_est_3``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` .. ``diff_2`` and ``target``. All columns are cast
        to ``int`` on a copy; the caller's frame is left untouched.
    p_table3 .. p_table1, base3 .. base1
        Forwarded unchanged to ``get_est_3``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 3
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_3(tuple(row), p_table3, p_table2, p_table1, base3, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))

In [9]:
def get_mse_2(dfngram, p_table2, p_table1, base2, base1):
    """Root-mean-square error of ``get_est_2`` predictions against ``target``.

    Every row of ``dfngram`` supplies a 2-value context (``diff_0``,
    ``diff_1``) and a ``target``. The context is handed to ``get_est_2``
    together with the given tables and sets, and the estimate is compared
    with the target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0``, ``diff_1`` and ``target``. All columns are cast to
        ``int`` on a copy; the caller's frame is left untouched.
    p_table2, p_table1, base2, base1
        Forwarded unchanged to ``get_est_2``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 2
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [
        get_est_2(tuple(row), p_table2, p_table1, base2, base1)
        for row in contexts
    ]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))

In [10]:
def get_mse_1(dfngram, p_table1, base1):
    """Root-mean-square error of ``get_est_1`` predictions against ``target``.

    Every row of ``dfngram`` supplies a one-value context (``diff_0``) and a
    ``target``. The context is handed to ``get_est_1`` as a 1-tuple together
    with ``p_table1`` and ``base1``, and the estimate is compared with the
    target.

    Parameters
    ----------
    dfngram : pandas.DataFrame
        Needs ``diff_0`` and ``target``. All columns are cast to ``int`` on a
        copy; the caller's frame is left untouched.
    p_table1, base1
        Forwarded unchanged to ``get_est_1``.

    Returns
    -------
    float
        ``sqrt(mean((target - estimate) ** 2))``, i.e. an RMSE despite the
        function name. NaN for a frame without rows.
    """
    ngram = 1
    dfngram = dfngram.astype('int')
    col = ["diff_{}".format(i) for i in range(ngram)]
    contexts = dfngram[col].values
    target = dfngram['target'].values  # extracted once rather than on every row

    est_list = [get_est_1(tuple(row), p_table1, base1) for row in contexts]

    # No 'est'/'diff' columns are attached: dfngram here is a private copy,
    # so such writes would never reach the caller.
    residual = target - np.array(est_list, dtype=float)
    return np.sqrt(np.mean(residual**2))

In [11]:
def get_est_7(basei, p_table7, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
              base7, base6, base5, base4, base3, base2, base1):
    """Look up the prediction for a 7-value context, shortening it until known.

    The full tuple is tried against ``base7``; if absent, the leading element
    is dropped and the remainder tried against ``base6``, and so on down to
    length 2. The first match returns the ``'pred'`` value of the row whose
    ``'base'`` equals that suffix in the matching table. If nothing matches,
    the last element alone is looked up in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    # (known contexts, lookup table), longest context first.
    levels = (
        (base7, p_table7),
        (base6, p_table6),
        (base5, p_table5),
        (base4, p_table4),
        (base3, p_table3),
        (base2, p_table2),
    )
    for skip, (known, table) in enumerate(levels):
        suffix = basei[skip:]
        if suffix in known:
            return table.loc[table['base'] == suffix, 'pred'].values[0]

    # Nothing longer matched: fall through to the single most recent value.
    last = basei[6]
    return p_table1.loc[p_table1['base'] == last, 'pred'].values[0]

In [12]:
def get_est_6(basei, p_table6, p_table5, p_table4, p_table3, p_table2, p_table1,
              base6, base5, base4, base3, base2, base1):
    """Look up the prediction for a 6-value context, shortening it until known.

    The full tuple is tried against ``base6``; if absent, the leading element
    is dropped and the remainder tried against ``base5``, and so on down to
    length 2. The first match returns the ``'pred'`` value of the row whose
    ``'base'`` equals that suffix in the matching table. If nothing matches,
    the last element alone is looked up in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    # (known contexts, lookup table), longest context first.
    levels = (
        (base6, p_table6),
        (base5, p_table5),
        (base4, p_table4),
        (base3, p_table3),
        (base2, p_table2),
    )
    for skip, (known, table) in enumerate(levels):
        suffix = basei[skip:]
        if suffix in known:
            return table.loc[table['base'] == suffix, 'pred'].values[0]

    # Nothing longer matched: fall through to the single most recent value.
    last = basei[5]
    return p_table1.loc[p_table1['base'] == last, 'pred'].values[0]

In [13]:
def get_est_5(basei, p_table5, p_table4, p_table3, p_table2, p_table1,
              base5, base4, base3, base2, base1):
    """Look up the prediction for a 5-value context, shortening it until known.

    The full tuple is tried against ``base5``; if absent, the leading element
    is dropped and the remainder tried against ``base4``, and so on down to
    length 2. The first match returns the ``'pred'`` value of the row whose
    ``'base'`` equals that suffix in the matching table. If nothing matches,
    the last element alone is looked up in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    # (known contexts, lookup table), longest context first.
    levels = (
        (base5, p_table5),
        (base4, p_table4),
        (base3, p_table3),
        (base2, p_table2),
    )
    for skip, (known, table) in enumerate(levels):
        suffix = basei[skip:]
        if suffix in known:
            return table.loc[table['base'] == suffix, 'pred'].values[0]

    # Nothing longer matched: fall through to the single most recent value.
    last = basei[4]
    return p_table1.loc[p_table1['base'] == last, 'pred'].values[0]

In [14]:
def get_est_4(basei, p_table4, p_table3, p_table2, p_table1, base4, base3, base2, base1):
    """Look up the prediction for a 4-value context, shortening it until known.

    The full tuple is tried against ``base4``, then its last three values
    against ``base3``, then its last two against ``base2``. The first match
    returns the ``'pred'`` value of the row whose ``'base'`` equals that
    suffix in the matching table. If nothing matches, the last element alone
    is looked up in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    # (known contexts, lookup table), longest context first.
    levels = (
        (base4, p_table4),
        (base3, p_table3),
        (base2, p_table2),
    )
    for skip, (known, table) in enumerate(levels):
        suffix = basei[skip:]
        if suffix in known:
            return table.loc[table['base'] == suffix, 'pred'].values[0]

    # Nothing longer matched: fall through to the single most recent value.
    last = basei[3]
    return p_table1.loc[p_table1['base'] == last, 'pred'].values[0]

In [15]:
def get_est_3(basei, p_table3, p_table2, p_table1, base3, base2, base1):
    """Look up the prediction for a 3-value context, shortening it until known.

    Tries the full tuple against ``base3``, then its last two values against
    ``base2``; the first match returns the ``'pred'`` value of the row whose
    ``'base'`` equals that key in the matching table. Otherwise the last
    element alone is looked up in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    if basei in base3:
        table, key = p_table3, basei
    elif basei[1:] in base2:
        table, key = p_table2, basei[1:]
    else:
        table, key = p_table1, basei[2]

    return table.loc[table['base'] == key, 'pred'].values[0]

In [16]:
def get_est_2(basei, p_table2, p_table1, base2, base1):
    """Look up the prediction for a 2-value context, or for its last value.

    When ``basei`` is in ``base2`` the ``'pred'`` value of the ``p_table2``
    row whose ``'base'`` equals it is returned; otherwise ``basei[1]`` is
    looked up the same way in ``p_table1``.

    ``base1`` is accepted but never consulted.
    """
    if basei in base2:
        table, key = p_table2, basei
    else:
        table, key = p_table1, basei[1]

    return table.loc[table['base'] == key, 'pred'].values[0]

In [17]:
def get_est_1(basei, p_table1, base1):
    """Return the ``'pred'`` of the ``p_table1`` row whose ``'base'`` equals ``basei[0]``.

    ``basei`` is indexed, so it is expected to be a sequence (callers in this
    file pass a 1-tuple). ``base1`` is accepted but never consulted.
    """
    key = basei[0]
    matches = p_table1.loc[p_table1['base'] == key, 'pred']
    return matches.values[0]
    

In [70]:
def get_cv(dfngram1, dfngram2, dfngram3, dfngram4, dfngram5, dfngram6, dfngram7,
           num_kfold=5, random_state=None):
    """K-fold cross-validation of the estimators for context lengths 1..7.

    Each frame is split independently with a shuffled ``KFold``. In every
    fold, ``get_markov`` is fitted on the training part of each frame and
    ``get_mse_<k>`` scores both the training and the held-out part of frame
    ``k`` using the fitted tables of orders ``k`` down to 1.

    Parameters
    ----------
    dfngram1 .. dfngram7 : pandas.DataFrame
        Table for context length 1..7 respectively.
    num_kfold : int, optional
        Number of folds (default 5, the previously hard-coded value).
    random_state : optional
        Passed to every ``KFold`` so a run can be reproduced. ``None`` keeps
        the former unseeded behaviour. The same value is used for all seven
        splitters.

    Returns
    -------
    mse_train, mse_test : pandas.DataFrame
        One row per fold and one column per context length; the column for
        context length ``k`` is labelled ``ngram_{k+1}`` (presumably counting
        the target as part of the n-gram; confirm). Values are what
        ``get_mse_<k>`` returns.

    Side effects
    ------------
    Writes both frames as CSV to ``../data/train_cv`` and ``../data/test_cv``.
    """
    frames = [dfngram1, dfngram2, dfngram3, dfngram4, dfngram5, dfngram6, dfngram7]
    scorers = [get_mse_1, get_mse_2, get_mse_3, get_mse_4, get_mse_5, get_mse_6, get_mse_7]
    orders = list(range(1, len(frames) + 1))

    # NOTE(review): every frame gets its own split, so the training rows of a
    # lower-order table are not guaranteed to exclude the rows a higher-order
    # test fold was derived from. Verify against how the CSVs were built.
    splitters = [
        KFold(len(frame), num_kfold, shuffle=True, random_state=random_state)
        for frame in frames
    ]

    list_mse_train = []
    list_mse_test = []

    for fold in zip(*splitters):
        train_parts = [frame.iloc[train_index] for frame, (train_index, _) in zip(frames, fold)]
        test_parts = [frame.iloc[test_index] for frame, (_, test_index) in zip(frames, fold)]

        fitted = [get_markov(part, ngram = order) for order, part in zip(orders, train_parts)]
        p_tables = [p_table for p_table, _ in fitted]
        bases = [base for _, base in fitted]

        mse_train = []
        mse_test = []
        for order, scorer in zip(orders, scorers):
            # get_mse_<k> expects tables k..1 followed by bases k..1.
            fitted_args = p_tables[:order][::-1] + bases[:order][::-1]
            mse_train.append(scorer(train_parts[order - 1], *fitted_args))
            mse_test.append(scorer(test_parts[order - 1], *fitted_args))

        list_mse_train.append(mse_train)
        list_mse_test.append(mse_test)

    col_name = ['ngram_2','ngram_3','ngram_4','ngram_5', 'ngram_6','ngram_7','ngram_8']
    mse_train = pd.DataFrame(data = list_mse_train)
    mse_test = pd.DataFrame(data = list_mse_test)
    mse_train.columns = col_name
    mse_test.columns = col_name

    mse_train.to_csv("../data/train_cv", sep=',', encoding='utf-8')
    mse_test.to_csv("../data/test_cv", sep=',', encoding='utf-8')

    return mse_train, mse_test

In [19]:

# Disabled alternative input: the same seven reads, pointed at the `small_101_*` files
# (presumably a reduced data set for quick runs; confirm). It is a bare string
# literal, so none of these reads execute.
'''
dfngram1 = pd.read_csv('../data/small_101_1.csv')
dfngram2 = pd.read_csv('../data/small_101_2.csv')
dfngram3 = pd.read_csv('../data/small_101_3.csv')
dfngram4 = pd.read_csv('../data/small_101_4.csv')
dfngram5 = pd.read_csv('../data/small_101_5.csv')
dfngram6 = pd.read_csv('../data/small_101_6.csv')
dfngram7 = pd.read_csv('../data/small_101_7.csv')
'''


In [136]:

# Read the seven n-gram tables in order; dfngram<k> comes from ngram101_<k>.csv.
(dfngram1, dfngram2, dfngram3, dfngram4,
 dfngram5, dfngram6, dfngram7) = map(pd.read_csv, (
    '../data/ngram101_1.csv',
    '../data/ngram101_2.csv',
    '../data/ngram101_3.csv',
    '../data/ngram101_4.csv',
    '../data/ngram101_5.csv',
    '../data/ngram101_6.csv',
    '../data/ngram101_7.csv',
))


In [146]:
# Scratch expression: its value is not assigned or used by any other cell.
1+1

2

In [None]:
# Run the cross-validation, then average every column over the folds.
mse_train, mse_test = get_cv(dfngram1, dfngram2, dfngram3, dfngram4, dfngram5, dfngram6, dfngram7)

# One row per score column, with the fold means of the training and held-out scores side by side.
mse_mean = pd.concat([mse_train.mean(), mse_test.mean()], axis=1)
mse_mean.columns = ['cv_train', 'cv_test']

# The axis is labelled "mse", but get_mse_* return the square root of the mean squared error.
ax1 = mse_mean.plot(figsize=(10,6), lw=2)
ax1.set_xlabel("ngram",fontsize=18)
ax1.set_ylabel("mse",fontsize=18)
ax1.set_title("cv_mean over ngram",fontsize=20)

plt.savefig("../data/cv_train_test2.png")
#plt.savefig("../data/cv_train_test2_small.png")
plt.show()


In [None]:
# Scratch expression: its value is not assigned or used by any other cell.
1+1