In [7]:
import pandas as pd
import os
import pickle
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

def read_data(name,k):
    os.chdir("C:\Spring_2017\machine_learning\DS1003_Final_Project")
    dataset = pickle.load( open( "pos_" + name + "_2.p", "rb" ) )
    Y = pickle.load( open( "tar_" + name + "_2.p", "rb" ) )

    dataset.columns = ['p-2','p-1','p0','p+1','p+2']
    dataset['Y'] = Y
    dataset['flag'] = (dataset=="None").sum(axis=1)<k
    dataset = dataset[dataset['Y'].str.contains("\.")]
    dataset = dataset[dataset['flag'] == True]

    lab = 'Y'
    Y = dataset[lab]
    X = dataset.drop(['flag'], 1)
    X =  X.apply(LabelEncoder().fit_transform)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    Y_train = pd.Series(Y_train)    
    Y_test = pd.Series(Y_test)
    return X_train, X_test, Y_train, Y_test

In [3]:
# Baseline accuracy
word_list = ['act','be','have','form','high','play','report','state','use','sum']
for word in word_list:
    Y = pickle.load( open( "tar_" + word + ".p", "rb" ) )
    print(" ",word,": ",Y.describe()[3]/Y.describe()[0])

  act :  0.217391304348
  be :  0.594564145895
  have :  0.354166666667
  form :  0.330935251799
  high :  0.465838509317
  play :  0.306569343066
  report :  0.431034482759
  state :  0.870498084291
  use :  0.780429594272
  sum :  0.486486486486


In [8]:
from sklearn import tree
import numpy as np
from sklearn.model_selection import GridSearchCV

def tree_method(X_train, Y_train, X_test, Y_test):
    param_grid = {'max_depth': np.arange(2,30)}
    svr = tree.DecisionTreeClassifier()
    clf = GridSearchCV(svr, param_grid )
    clf.fit(X_train, Y_train)

    tree_preds = clf.predict(X_test)
    score = clf.score(X_test, Y_test)
    return tree_preds, score

In [9]:
def tree_method_on_wordlist(word_list):
    score_dict={}
    cross_dict = {}
    for word in word_list:
        X_train, X_test, Y_train, Y_test = read_data(word,100)
        tree_preds, score = tree_method(X_train, Y_train, X_test, Y_test)
        df = pd.DataFrame()
        df['truth'] = Y_test
        df['predict'] = tree_preds
        cross_dict[word] = pd.crosstab(df['truth'], df['predict'], margins=True)
        score_dict[word] = score
    print(sum(score_dict.values())/len(word_list))
    return cross_dict, score_dict

In [10]:
word_list = ['act','be','have','form','high','play','report','state','use','sum']
cross_dict, score_dict = tree_method_on_wordlist(word_list)



0.975116350597


In [12]:
score_dict

{'act': 1.0,
 'be': 0.99920634920634921,
 'form': 0.95999999999999996,
 'have': 1.0,
 'high': 0.92000000000000004,
 'play': 0.88461538461538458,
 'report': 1.0,
 'state': 1.0,
 'sum': 1.0,
 'use': 0.98734177215189878}

In [13]:
from sklearn.svm import SVC
from sklearn import tree
import numpy as np
from sklearn.model_selection import GridSearchCV

def svm_method(X_train, Y_train, X_test, Y_test):
    param_grid = {'C': [10**i for i in range(-4,4)]}
    svr = SVC(kernel='linear')
    clf = GridSearchCV(svr, param_grid )
    clf.fit(X_train, Y_train)

    svm_preds = clf.predict(X_test)
    score = clf.score(X_test, Y_test)
    return svm_preds, score
#svm_method(X_train, Y_train, X_test, Y_test)

In [57]:
def svm_method_on_wordlist(word_list):
    score_dict={}
    cross_dict = {}
    for word in word_list:
        X_train, X_test, Y_train, Y_test = read_data(word,1)
        svm_preds, score = svm_method(X_train, Y_train, X_test, Y_test)
        df = pd.DataFrame()
        df['truth'] = Y_test
        df['predict'] = svm_preds
        cross_dict[word] = pd.crosstab(df['truth'], df['predict'], margins=True)
        score_dict[word] = score
    print(sum(score_dict.values())/len(word_list))
    return cross_dict, score_dict
cross_dict1, score_dict1 = svm_method_on_wordlist(word_list)



0.849171559273


In [59]:
score_dict

{'act': 1.0,
 'be': 0.99920634920634921,
 'form': 0.95999999999999996,
 'have': 1.0,
 'high': 0.92000000000000004,
 'play': 0.88461538461538458,
 'report': 1.0,
 'state': 1.0,
 'sum': 1.0,
 'use': 0.98734177215189878}

In [54]:
def forest_method(X_train, Y_train, X_test, Y_test):
    param_grid = {'n_estimators': [10*i for i in range(1,10)]}
    svr = RandomForestClassifier() 
    clf = GridSearchCV(svr, param_grid )
    clf.fit(X_train, Y_train)

    svm_preds = clf.predict(X_test)
    score = clf.score(X_test, Y_test)
    return svm_preds, score

In [55]:
def forest_method_on_wordlist(word_list):
    score_dict={}
    cross_dict = {}
    for word in word_list:
        X_train, X_test, Y_train, Y_test = read_data(word,1)
        svm_preds, score = forest_method(X_train, Y_train, X_test, Y_test)
        df = pd.DataFrame()
        df['truth'] = Y_test
        df['predict'] = svm_preds
        cross_dict[word] = pd.crosstab(df['truth'], df['predict'], margins=True)
        score_dict[word] = score
    print(sum(score_dict.values())/len(word_list))
    return cross_dict, score_dict
cross_dict2, score_dict2 = forest_method_on_wordlist(word_list)



0.944873431688


In [56]:
score_dict2

{'act': 1.0,
 'be': 0.99661781285231121,
 'form': 0.94444444444444442,
 'have': 0.96212121212121215,
 'high': 0.875,
 'play': 0.9375,
 'report': 0.75,
 'state': 1.0,
 'sum': 1.0,
 'use': 0.98305084745762716}