### This code was written for Python 2.6 and sklearn 0.12.1 to run on the server; it is kept mostly as a record of the Jupyter notebook session

### load the package needed

In [5]:
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_curve,auc
import numpy as np
import scipy as sp

### Read the dataset (replace it with the real training data later) and drop the index column, review_id and votes_total_user

In [5]:
# Load the training table from an absolute path on the server.
data = pd.read_csv("/home/lx557/final_getdummies_training.csv")

In [6]:
data.head(1)#see which should be dropped
# Drop the first column together with 'review_id' and 'votes_total_user'
# (the second positional argument, 1, selects the column axis).
data=data.drop([data.columns[0],'review_id','votes_total_user'],1)#should leave 31 columns
# Drop whichever column is first after the previous drop.
# NOTE(review): presumably a second saved-index column -- confirm against the CSV header.
data=data.drop(data.columns[0],1)

In [None]:
##Shape(913199, 31)

### Split it into two parts, the first trains the model, the second picks up the best feature set

In [7]:
# Hold out 20% of the rows. The whole frame (still containing 'Label' and
# 'text') is split alongside the label column.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run -- confirm whether reproducibility matters here.
X_train, X_test, Y_train, Y_test = train_test_split(data,data['Label'], test_size=0.2)

In [None]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
# The next line is the recorded output of the expression above.
((730559, 31), (182640, 31), (730559,), (182640,))

In [None]:
# Re-wrap the split results as DataFrames carrying the original column names.
# NOTE(review): presumably needed because this sklearn version hands back
# plain arrays -- confirm.
X_train=pd.DataFrame(X_train,columns=data.columns)
X_test=pd.DataFrame(X_test,columns=data.columns)
# Raw review text of the training split; it is vectorized later for the baseline model.
X_1=X_train['text']

### Define metric functions and functions for getting output

In [None]:
def roc_auc_score(y_true, y_score, average="macro"):
    """Return the area under the ROC curve for a two-class problem.

    Local stand-in for ``sklearn.metrics.roc_auc_score``, which does not
    exist in sklearn 0.12.1.

    y_true  -- true labels; must contain exactly two distinct values
    y_score -- scores passed to ``roc_curve`` together with ``y_true``
    average -- accepted only so the signature matches the sklearn function;
               it is never read

    Raises ValueError when ``y_true`` does not hold exactly two classes.
    """
    n_classes = len(np.unique(y_true))
    if n_classes != 2:
        # The check fires for a single class and for more than two classes,
        # so the message reports the count instead of assuming one class.
        raise ValueError(
            "ROC AUC score is not defined unless y_true has exactly two "
            "classes (found {0}).".format(n_classes))
    fpr, tpr, _ = roc_curve(y_true, y_score)
    return auc(fpr, tpr)

In [None]:
def plotROC(X_test_proba,Y_test,label_string):
    """Plot one ROC curve, save it to '<label_string>.pdf' and return its AUC.

    X_test_proba -- predicted scores, passed to ``roc_curve`` as y_score
    Y_test       -- true labels, passed to ``roc_curve`` as y_true
    label_string -- used for the legend entry, the plot title and the
                    output file name

    Returns the area under the plotted curve.
    """
    fpr, tpr, _ = roc_curve(Y_test, X_test_proba)
    roc_auc = auc(fpr, tpr)
    # Random RGB colour for the curve.
    c = (np.random.rand(), np.random.rand(), np.random.rand())
    fig = plt.figure()
    try:
        plt.plot(fpr, tpr, color = c, label = label_string + ' (AUC = %0.3f)' % roc_auc)
        # Diagonal reference line (chance level).
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title('ROC_{0}'.format(label_string))
        plt.legend(loc=4,prop={'size':10})
        plt.savefig('{0}.pdf'.format(label_string))
    finally:
        # pyplot keeps a figure registered until it is closed explicitly;
        # without this every call would leave one more figure in memory.
        plt.close(fig)
    return roc_auc

### Lr_model1_GridSearch(no cross_validation)_baseline (Using text only)

In [None]:
# Second-level split: carve a 20% tuning subset out of the training split.
# The grid-search cells fit on X_subtrain and score on X_subtest.
X_subtrain, X_subtest, Y_subtrain, Y_subtest = train_test_split(X_train,X_train['Label'], test_size=0.2)

In [None]:
X_subtrain.shape,X_subtest.shape,Y_subtrain.shape,Y_subtest.shape
# The next line is the recorded output of the expression above.
(584447, 31), (146112, 31), (584447,), (146112,)

In [None]:
# Re-wrap as DataFrames with named columns, as was done for X_train / X_test.
X_subtrain=pd.DataFrame(X_subtrain,columns=X_train.columns)
X_subtest=pd.DataFrame(X_subtest,columns=X_train.columns)

In [None]:
def Self_GridS(X_train, X_test, y_train, y_test):
    """Exhaustively score the text-only pipeline and return the AUC per setting.

    The grid is read from the module-level ``parameters`` dict, which must
    exist before this is called and provide the keys 'vect__max_features',
    'vect__ngram_range', 'tfidf__use_idf', 'lr__alpha' and 'lr__penalty'
    (each an iterable of candidate values).

    X_train, X_test -- raw documents used for fitting / scoring
    y_train, y_test -- labels aligned with those documents

    Returns a dict mapping 'b=..,c=..,d=..,e=..,f=..' (max_features,
    ngram_range, use_idf, alpha, penalty) to the ROC AUC on X_test.
    """
    auc_score = {}
    for b in parameters['vect__max_features']:
        for c in parameters['vect__ngram_range']:
            # One vectorizer per (max_features, ngram_range) pair. transform()
            # returns new matrices, so the inputs need no defensive copies.
            vectorizer = CountVectorizer(max_features=b, ngram_range=c)
            vectorizer.fit(X_train)
            counts_tr = vectorizer.transform(X_train)
            counts_te = vectorizer.transform(X_test)
            for d in parameters['tfidf__use_idf']:
                if d:
                    tfmodel = TfidfTransformer(use_idf=d)
                    tfmodel.fit(counts_tr)
                    X_tr1 = tfmodel.transform(counts_tr)
                    X_te1 = tfmodel.transform(counts_te)
                else:
                    # use_idf switched off: the classifier sees the raw counts.
                    X_tr1, X_te1 = counts_tr, counts_te
                for e in parameters['lr__alpha']:
                    for f in parameters['lr__penalty']:
                        lrmodel = SGDClassifier(loss='log', alpha=e, penalty=f)
                        lrmodel.fit(X_tr1, y_train)
                        y_predict = lrmodel.predict_proba(X_te1)[:, 1]
                        # Named 'score' so the imported sklearn auc() is not shadowed.
                        score = roc_auc_score(y_test, y_predict)
                        auc_score['b={0},c={1},d={2},e={3},f={4}'.format(b,c,d,e,f)] = score
    return auc_score

In [None]:
# Run the text-only grid search on the tuning split.
# (The call previously named a non-existent function, Self_.)
auc_score_lr1=Self_GridS(X_subtrain['text'], X_subtest['text'], Y_subtrain, Y_subtest)

In [None]:
# Persist one "<setting>, <auc>" line per grid point. The with-block flushes
# and closes the file even if a write fails.
# NOTE: the setting string itself contains commas, so tools that parse this
# file as CSV will split it across several columns.
with open('baseline_AUC.csv', 'w') as f:
    for key, value in auc_score_lr1.items():
        f.write('{0}, {1}\n'.format(key,value))

In [None]:
# Best baseline setting found by the grid search (AUC 0.693732339):
# b=None, c=(1, 2), d=False, e=0.01, f=l2

In [None]:
# Refit the baseline with the chosen setting (no feature cap, uni+bigrams,
# raw counts) on the full training split.
vectorizer=CountVectorizer(max_features=None,ngram_range=(1,2))
vectorizer.fit(X_1)
# From here on X_1 is the sparse count matrix, no longer the raw text.
X_1=vectorizer.transform(X_1)
X_test1=vectorizer.transform(X_test['text'].copy())
# Count matrices for the tuning subsets, built with the same vocabulary;
# Self_GridS2 reads these two names from module scope.
X_subtrain_text=vectorizer.transform(X_subtrain['text'].copy())
X_subtest_text=vectorizer.transform(X_subtest['text'].copy())
lrmodel=SGDClassifier(loss='log',alpha=0.01,penalty='l2')
lrmodel.fit(X_1,Y_train)
# Positive-class probabilities on the held-out split and on the training split.
prob1=lrmodel.predict_proba(X_test1)[:,1]
prob1_train=lrmodel.predict_proba(X_1)[:,1]
# Baseline probabilities on the tuning subsets; Self_GridS3 uses them as the
# 'Label_prob' feature.
# NOTE(review): this model was fitted on all of X_train, which X_subtest was
# split from, so these scores are not out-of-sample -- confirm this is intended.
prob_train_forlr3=lrmodel.predict_proba(X_subtrain_text)[:,1]
prob_test_forlr3=lrmodel.predict_proba(X_subtest_text)[:,1]
np.savetxt('predict_proba_result_on_test_lr_baseline.txt',prob1)

In [None]:
# ROC plot (saved as Baseline_lr.pdf) and AUC of the baseline on the held-out split.
lr1_auc= plotROC(prob1,Y_test, "Baseline_lr")

In [None]:
# Error analysis: collect the held-out reviews the baseline gets wrong at a
# 0.5 probability cut-off and dump them to CSV.
df_lr1=pd.DataFrame(X_test['text'].copy())
df_lr1['target']=X_test['Label']
df_lr1['prediction']=prob1
df_lr1['predict_class'] = pd.Series(df_lr1['prediction'] >= 0.5, dtype=int)
# Target is 1 but the model predicted 0.
wrong_base_shouldbe_1=df_lr1[np.logical_and(df_lr1['target']==1, df_lr1['predict_class']==0)]
# Target is 0 but the model predicted 1.
wrong_base_shouldbe_0=df_lr1[np.logical_and(df_lr1['target']==0, df_lr1['predict_class']==1)]
wrong_base_shouldbe_1.to_csv('baseline_shouldbe1.csv')
wrong_base_shouldbe_0.to_csv('baseline_shouldbe0.csv')

### Lr_model2_text_plus_rest (user/business_information)

In [None]:
# Grid for model 2: regularisation strengths 10**-3 .. 10**3 and both
# penalties, widened slightly after looking at the lr_model1 results.
parameters1 = dict(
    alpha=[10 ** exponent for exponent in range(-3, 4)],
    penalty=['l1', 'l2'])

In [None]:
def Self_GridS2(X_train, X_test, y_train, y_test, train_text=None, test_text=None):
    """Grid-search the text-plus-other-columns model and return AUC per setting.

    The alpha / penalty candidates come from the module-level ``parameters1``.

    X_train, X_test -- DataFrames holding 'text', 'Label' and the remaining
                       feature columns; everything except 'text' and 'Label'
                       is cast to float and used as features
    y_train, y_test -- labels aligned with the frames
    train_text, test_text -- sparse text-feature matrices with rows aligned
                       to X_train / X_test; when omitted, the module-level
                       X_subtrain_text / X_subtest_text are used, which is
                       what the function always did before

    Returns a dict mapping 'a=<alpha>,b=<penalty>' to the ROC AUC on X_test.
    """
    if train_text is None:
        train_text = X_subtrain_text
    if test_text is None:
        test_text = X_subtest_text
    other_tr = sp.sparse.csr_matrix(X_train.drop(['text','Label'],1).astype(float))
    other_te = sp.sparse.csr_matrix(X_test.drop(['text','Label'],1).astype(float))
    # Convert to CSR once up front, as the final-model cells do, instead of
    # leaving the conversion to every fit / predict call in the loop.
    features_tr = sp.sparse.hstack([train_text, other_tr]).tocsr()
    features_te = sp.sparse.hstack([test_text, other_te]).tocsr()
    auc_score = {}
    for a in parameters1['alpha']:
        for b in parameters1['penalty']:
            lrmodel = SGDClassifier(loss='log', alpha=a, penalty=b)
            lrmodel.fit(features_tr, y_train)
            y_predict = lrmodel.predict_proba(features_te)[:, 1]
            # Named 'score' so the imported sklearn auc() is not shadowed.
            score = roc_auc_score(y_test, y_predict)
            auc_score['a={0},b={1}'.format(a,b)] = score
    return auc_score

In [None]:
# Grid-search model 2 (text + remaining columns) on the tuning split; copies
# of the frames are handed in.
auc_score_lr2=Self_GridS2(X_subtrain.copy(), X_subtest.copy(), Y_subtrain, Y_subtest)

In [None]:
# Persist one "<setting>, <auc>" line per grid point. The with-block flushes
# and closes the file even if a write fails.
# NOTE: the setting string ('a=..,b=..') contains a comma of its own.
with open('baseline_plusrest_AUC.csv', 'w') as f:
    for key, value in auc_score_lr2.items():
        f.write('{0}, {1}\n'.format(key,value))

In [None]:
# Best setting recorded from the grid search above: a=0.001, b=l2 (AUC 0.71789502413225736)

In [None]:
# Training features for model 2: the baseline count matrix followed by every
# remaining column (all but 'text' and 'Label') cast to float.
X_2_text=X_1
X_2_other=sp.sparse.csr_matrix(X_train.drop(['text','Label'],1).astype(float))
X_2=sp.sparse.hstack([X_2_text,X_2_other]) #text counts first, then the other columns
# Store the stacked matrix as CSR.
X_2=sp.sparse.csr_matrix(X_2)

In [None]:
# Same construction for the held-out split.
X_test_text=X_test1
X_test_other=sp.sparse.csr_matrix(X_test.drop(['text','Label'],1).astype(float))
X_test2=sp.sparse.hstack([X_test_text,X_test_other]) #text counts first, then the other columns
X_test2=sp.sparse.csr_matrix(X_test2)

In [None]:
# Fit model 2 on the full training split with the setting chosen from the grid search.
lrmodel=SGDClassifier(loss='log',alpha=0.001,penalty='l2')#pick up the best result
lrmodel.fit(X_2,Y_train)

In [None]:
# Positive-class probabilities on the held-out split, saved to disk, then the
# ROC plot (Baseline_lr_plusrest.pdf) and its AUC.
prob2=lrmodel.predict_proba(X_test2)[:,1]
np.savetxt('predict_proba_result_lr_plusrest.txt',prob2)
lr2_auc= plotROC(prob2,Y_test, "Baseline_lr_plusrest")

In [None]:
# Error analysis for model 2: held-out rows misclassified at a 0.5 cut-off,
# written out with all their original columns.
df_lr2=pd.DataFrame(X_test.copy())#X_test is the original dataset
df_lr2['prediction']=prob2
df_lr2['predict_class'] = pd.Series(df_lr2['prediction'] >= 0.5, dtype=int)
# Label is 1 but the model predicted 0.
wrong_2_shouldbe_1=df_lr2[np.logical_and(df_lr2['Label']==1, df_lr2['predict_class']==0)]
# Label is 0 but the model predicted 1.
wrong_2_shouldbe_0=df_lr2[np.logical_and(df_lr2['Label']==0, df_lr2['predict_class']==1)]
wrong_2_shouldbe_1.to_csv('baseline_plusrest_shouldbe1.csv')
wrong_2_shouldbe_0.to_csv('baseline_plusrest_shouldbe0.csv')

### Lr_model3_loop_lr

In [None]:
# Grid for model 3: regularisation strengths 10**-3 .. 10**3 and both penalties.
parameters2 = dict(
    alpha=[10 ** exponent for exponent in range(-3, 4)],
    penalty=['l1', 'l2'])

In [None]:
def Self_GridS3(X_1train, X_1test, y_train, y_test):
    """Grid-search model 3 and return AUC per setting.

    Model 3 uses the non-text columns plus a 'Label_prob' column holding the
    baseline model's predicted probability, taken from the module-level
    ``prob_train_forlr3`` / ``prob_test_forlr3``. The alpha / penalty
    candidates come from the module-level ``parameters2``.

    X_1train, X_1test -- DataFrames containing 'text', 'Label' and the other
                         feature columns; they are no longer modified
    y_train, y_test   -- labels aligned with the frames

    Returns a dict mapping 'a=<alpha>,b=<penalty>' to the ROC AUC on X_1test.
    """
    # drop() returns new frames, so adding 'Label_prob' afterwards leaves the
    # caller's DataFrames untouched. (Previously the column was written into
    # the arguments themselves through an alias.)
    frame_tr = X_1train.drop(['text','Label'],1)
    frame_te = X_1test.drop(['text','Label'],1)
    frame_tr['Label_prob'] = prob_train_forlr3
    frame_te['Label_prob'] = prob_test_forlr3
    features_tr = sp.sparse.csr_matrix(frame_tr.astype(float))
    features_te = sp.sparse.csr_matrix(frame_te.astype(float))
    auc_score = {}
    for a in parameters2['alpha']:
        for b in parameters2['penalty']:
            lrmodel = SGDClassifier(loss='log', alpha=a, penalty=b)
            lrmodel.fit(features_tr, y_train)
            y_predict = lrmodel.predict_proba(features_te)[:, 1]
            # Named 'score' so the imported sklearn auc() is not shadowed.
            score = roc_auc_score(y_test, y_predict)
            auc_score['a={0},b={1}'.format(a,b)] = score
    return auc_score

In [None]:
# Grid-search model 3 (non-text columns + baseline probability) on the tuning
# split; copies of the frames are handed in.
auc_score_lr3=Self_GridS3(X_subtrain.copy(), X_subtest.copy(), Y_subtrain, Y_subtest)

In [None]:
# Persist one "<setting>, <auc>" line per grid point. The with-block flushes
# and closes the file even if a write fails.
# NOTE: the setting string ('a=..,b=..') contains a comma of its own.
with open('baseline_loop_AUC.csv', 'w') as f:
    for key, value in auc_score_lr3.items():
        f.write('{0}, {1}\n'.format(key,value))

In [None]:
# Training features for model 3: the non-text columns plus the baseline's
# predicted probability. Dropping first yields a new frame, so 'Label_prob' is
# no longer written into X_train itself (X_3 used to be an alias of it).
X_3=X_train.drop(['text','Label'],1)
X_3['Label_prob']=prob1_train

In [None]:
# Held-out features for model 3, built the same way. Dropping first yields a
# new frame, so 'Label_prob' is no longer written into X_test itself (X_test3
# used to be an alias of it).
X_test3=X_test.drop(['text','Label'],1)
X_test3['Label_prob']=prob1

In [None]:
# Fit model 3 on the full training split.
lrmodel=SGDClassifier(loss='log',alpha=0.001,penalty='l2')#pick up the best result
# Cast to float explicitly, matching the grid search and the predict call on
# X_test3, instead of fitting on the frame's original dtypes.
lrmodel.fit(X_3.astype(float),Y_train)

In [None]:
# Positive-class probabilities of model 3 on the held-out split, then the ROC
# plot (Baseline_loop.pdf) and its AUC.
prob3=lrmodel.predict_proba(X_test3.astype(float))[:,1]
lr3_auc= plotROC(prob3,Y_test, "Baseline_loop")

In [None]:
# Error analysis for model 3: held-out rows misclassified at a 0.5 cut-off.
df_lr3=pd.DataFrame(X_test3.copy())#X_test3 holds the model-3 feature columns
df_lr3['prediction']=prob3
# X_test3 no longer has 'Label', so the target is taken from X_test.
df_lr3['target']=X_test['Label']
df_lr3['predict_class'] = pd.Series(df_lr3['prediction'] >= 0.5, dtype=int)
# Target is 1 but the model predicted 0.
wrong_3_shouldbe_1=df_lr3[np.logical_and(df_lr3['target']==1, df_lr3['predict_class']==0)]
# Target is 0 but the model predicted 1.
wrong_3_shouldbe_0=df_lr3[np.logical_and(df_lr3['target']==0, df_lr3['predict_class']==1)]
wrong_3_shouldbe_1.to_csv('baseline_loop_shouldbe1.csv')
wrong_3_shouldbe_0.to_csv('baseline_loop_shouldbe0.csv')

In [None]:
# Write a short text summary of the model-3 ("loop") run. The with-block
# flushes and closes the file even if one of the writes raises.
# NOTE(review): git3 and best_para3 are not defined in any cell of this record
# (presumably left over from an earlier GridSearchCV attempt) -- define them
# or replace the two lines that use them before running this cell.
with open('results_lr_loop.txt', 'w') as f:
    f.write('This is for the lr_baseline_loop.txt\n')
    f.write('{0}\n'.format(parameters2))
    f.write('For training models, The best AUC is {0}\n'.format(git3.best_score_))
    for param_name in sorted(parameters2.keys()):
        f.write("\t%s: %r\n" % (param_name, best_para3[param_name]))
    f.write('for the training set, the auc is{0}\n'.format(lr3_auc))

### Then compare the three models and pick the best

In [None]:
# Show the three held-out AUCs side by side. Written as a function call on a
# single joined string so the line prints the same space-separated output
# under Python 2 and is also valid syntax under Python 3.
print(' '.join(str(v) for v in (lr1_auc, lr2_auc, lr3_auc)))

In [None]:
# Prepare the final run: train on the whole training table and evaluate on
# the separate validation file.
data_vali=pd.read_csv('final_dummies_validation.csv')
X_data_text=data['text'].copy()
# X_data does not exist yet at this point; the text column is what was inspected.
X_data_text.shape  # recorded output: (913199,)
X_vali_text=data_vali['text'].copy()
Y_data=data['Label'].copy()
# NOTE: this rebinds Y_test, which until now held the labels of the 20% hold-out split.
Y_test=data_vali['Label'].copy()
# Stray recorded output from the original cell, presumably a validation-set shape: (391371,)

In [None]:
# Final model: text counts plus the remaining columns, fitted on the entire
# training table and scored on the validation table.
vectorizer=CountVectorizer(max_features=None,ngram_range=(1,2))
vectorizer.fit(X_data_text)
X_data_text=vectorizer.transform(X_data_text)
X_data_other=sp.sparse.csr_matrix(data.drop(['text','Label'],1).astype(float))
X_data=sp.sparse.hstack([X_data_text,X_data_other]) #text counts first, then the other columns
X_data=sp.sparse.csr_matrix(X_data)
# The validation text is transformed with the vocabulary learned from the training text.
X_vali_text=vectorizer.transform(X_vali_text)
# NOTE(review): data_vali is used as loaded, whereas `data` had its leading
# columns, 'review_id' and 'votes_total_user' dropped first -- confirm the
# validation CSV already has exactly the same feature columns.
X_vali_other=sp.sparse.csr_matrix(data_vali.drop(['text','Label'],1).astype(float))
X_vali=sp.sparse.hstack([X_vali_text,X_vali_other]) #text counts first, then the other columns
X_vali=sp.sparse.csr_matrix(X_vali)
# NOTE(review): alpha=0.01 is the value used for the text-only baseline; the
# earlier text-plus-other-columns model was fitted with alpha=0.001 --
# confirm which one is intended here.
lrmodel=SGDClassifier(loss='log',alpha=0.01,penalty='l2')
lrmodel.fit(X_data,Y_data.values.astype(float))
# AUC and ROC plot on the validation table (Validation.pdf).
prob=lrmodel.predict_proba(X_vali.astype(float))[:,1]
vali_auc= plotROC(prob,Y_test.values.astype(float), "Validation")
# AUC and ROC plot on the data the model was fitted on (Validation_train.pdf).
proba_train=lrmodel.predict_proba(X_data)[:,1]
train_auc= plotROC(proba_train,Y_data.values.astype(float), "Validation_train")

In [None]:
### For all
# Load the test table; it is not used further in the cells recorded here.
df_test=pd.read_csv('final_dummies_test.csv')

##### Other approaches such as GridSearchCV and Pipeline were tried earlier, but they are not appropriate for such a large amount of data