In [1]:

import csv
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, roc_auc_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

from sklearn.externals import joblib
import pandas as pd
import sys
import os
sys.path.append('/home/nick/newuser')
from imblearn.over_sampling import SMOTE

In [2]:

# Training sample: tab-separated text whose first row holds the column names.
Location = r'C:/intern/new_user_prediction/newuser_model_sampled/0713_data_sampled.csv'
df = pd.read_csv(
    Location,
    sep='\t',
    header=0,
    low_memory=False,
)
# Independent copy of the raw frame; the next cell recodes its 'platform' column in place.
df_platform = df.copy(deep=True)


In [3]:


# Recode the platform labels as integers on the working copy (android -> 0, ios -> 1);
# `df` itself keeps the original strings.
for platform_label, platform_code in (('android', 0), ('ios', 1)):
    df_platform.loc[df_platform['platform'] == platform_label, 'platform'] = platform_code

# The first 17 columns are kept aside untouched in `backup`; later cells attach
# them to the evaluation output.
backup = df.iloc[:, :17].copy()

# Everything after those 17 columns becomes the modelling frame.
df1 = df.drop(df.columns[:17], axis=1)

# 'activity_half_reduced_8_14' is written as the 0/1 flip of 'active_in_8_14'.
# The flip goes through the placeholder value 2 so the two assignments do not collide.
df1['activity_half_reduced_8_14'] = df1['active_in_8_14']
for old_value, new_value in ((0, 2), (1, 0), (2, 1)):
    df1.loc[df1['activity_half_reduced_8_14'] == old_value, 'activity_half_reduced_8_14'] = new_value

# Put the integer-coded platform back in front as the first column.
df1.insert(0, 'platform', df_platform['platform'])


In [4]:


# Row counts per platform (from the untouched string labels in `backup`) ...
print('andriod:{}'.format((backup['platform'] == 'android').sum()))
print('ios:{}'.format((backup['platform'] == 'ios').sum()))
print()
# ... then per (platform code, active_in_8_14) combination in the modelling frame.
print('andriod=> 1:{}'.format(((df1['platform'] == 0) & (df1['active_in_8_14'] == 1)).sum()))
print('andriod=> 0:{}'.format(((df1['platform'] == 0) & (df1['active_in_8_14'] == 0)).sum()))

print('ios=> 1:{}'.format(((df1['platform'] == 1) & (df1['active_in_8_14'] == 1)).sum()))
print('ios=> 0:{}'.format(((df1['platform'] == 1) & (df1['active_in_8_14'] == 0)).sum()))


andriod:2388
ios:2611

andriod=> 1:669
andriod=> 0:1719
ios=> 1:979
ios=> 0:1632


In [5]:


def feature_result_split(data, df_num):
    """Split ``data`` into a feature frame and one positional target column.

    All columns except the last five are returned as features. The target is
    picked by position from the end of the frame:

    * ``df_num == 1`` -> column ``-4``
    * ``df_num == 2`` -> column ``-1``
    * ``df_num == 3`` -> column ``-3``

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose trailing five columns are non-feature columns.
    df_num : int
        Target selector; must be 1, 2 or 3.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature columns and the selected target column.

    Raises
    ------
    ValueError
        If ``df_num`` is not one of the supported selectors. (Previously this
        surfaced as an UnboundLocalError on ``result``.)
    """
    # Selector -> position of the target column, counted from the end of the frame.
    target_position = {1: -4, 2: -1, 3: -3}
    try:
        position = target_position[df_num]
    except (KeyError, TypeError):
        raise ValueError('df_num must be 1, 2 or 3, got {!r}'.format(df_num))

    feature = data.iloc[:, :-5]
    result = data.iloc[:, position]

    # Class balance of the chosen target: count of 1s, then count of 0s.
    print('df:',df_num, '1 vs. 0:',len(result[result==1]), len(result[result==0]))

    return feature, result


In [6]:

#feature and result split

# One (features, target) pair per prediction task; the integer picks the target column.
(X_1, Y_1), (X_2, Y_2), (X_3, Y_3) = [
    feature_result_split(df1, task_id) for task_id in (1, 2, 3)
]


df: 1 1 vs. 0: 1648 3351
df: 2 1 vs. 0: 41 4958
df: 3 1 vs. 0: 3351 1648


In [7]:


def train_test_smote(feature_set, result_set, backup):
    """Split the data into train / validation / test parts and oversample the training part.

    The split is driven by the index labels of ``feature_set``:

    * 60% of the rows (rounded down) are drawn without replacement as hold-out rows;
    * half of those (rounded down) form the validation set, the rest the test set;
    * every row not drawn forms the training set.

    Only the training part is passed through SMOTE; validation and test keep
    their original class balance.

    Parameters
    ----------
    feature_set : pandas.DataFrame
        Feature columns.
    result_set : pandas.Series
        Target values, indexed like ``feature_set``.
    backup : pandas.DataFrame
        Extra columns, indexed like ``feature_set``, that are sliced with the
        same validation / test labels and returned alongside.

    Returns
    -------
    tuple
        ``(training_f_af, training_r_af, validate_f, validate_r,
        backup_validate, test_f, test_r, backup_test)`` where the first two
        items are the SMOTE-resampled training features and targets.
    """
    # NOTE(review): these keyword names (`ratio`, `k`, `m`, `kind`, ...) and `fit_sample`
    # below look like the older imbalanced-learn API; confirm the installed version
    # before upgrading the library.
    smt=SMOTE(ratio='auto', random_state=10, k=None, k_neighbors=5, m=None, m_neighbors=10, out_step=0.5, kind='regular', svm_estimator=None, n_jobs=-1)

    # Number of hold-out rows (validation + test): 60% of the data, rounded down.
    subset = int(np.floor(len(feature_set) * 0.6))

    # Each draw uses its own RandomState seeded with 1. This yields the same draws as
    # reseeding the global generator with 1 before each call, but leaves the global
    # NumPy random state untouched for the rest of the notebook.
    index_whole = np.random.RandomState(1).choice(feature_set.index, subset, replace=False)
    index_validate = np.random.RandomState(1).choice(pd.Index(index_whole), subset // 2, replace=False)

    # Hold-out rows not picked for validation become the test set.
    index_test = pd.Index(index_whole).difference(pd.Index(index_validate))
    # Rows never drawn as hold-out become the training set.
    index_train = feature_set.index.difference(pd.Index(index_whole))

    training_f = feature_set.loc[index_train, :]
    training_r = result_set.loc[index_train]

    validate_f = feature_set.loc[index_validate, :]
    validate_r = result_set.loc[index_validate]
    backup_validate = backup.loc[index_validate, :]

    test_f = feature_set.loc[index_test, :]
    test_r = result_set.loc[index_test]
    backup_test = backup.loc[index_test, :]

    # SMOTE: oversample the training part only.
    training_f_af, training_r_af = smt.fit_sample(training_f, training_r)

    print('balanced data ratio in training set:')
    print(_negative_to_positive_ratio(training_r_af))
    # The message says "test set", but the ratio printed here is computed on the
    # validation targets.
    print('balanced data ratio in test set:')
    print(_negative_to_positive_ratio(validate_r))

    return training_f_af, training_r_af, validate_f, validate_r, backup_validate, test_f, test_r, backup_test


def _negative_to_positive_ratio(labels):
    """Return (# labels equal to 0) / (# labels equal to 1) without dividing by zero.

    Gives ``inf`` when there are 0-labels but no 1-labels, and ``nan`` when
    neither value is present.
    """
    n_negative = len(labels[labels == 0])
    n_positive = len(labels[labels == 1])
    if n_positive == 0:
        return float('inf') if n_negative else float('nan')
    return n_negative / n_positive



In [8]:


# Same split + SMOTE procedure for each of the three targets; `backup` is sliced
# alongside so the validation/test rows keep their extra columns.
(
    X1_train, Y1_train,
    X1_validate, Y1_validate, backup1_validate,
    X1_test, Y1_test, backup1_test,
) = train_test_smote(X_1, Y_1, backup)

(
    X2_train, Y2_train,
    X2_validate, Y2_validate, backup2_validate,
    X2_test, Y2_test, backup2_test,
) = train_test_smote(X_2, Y_2, backup)

(
    X3_train, Y3_train,
    X3_validate, Y3_validate, backup3_validate,
    X3_test, Y3_test, backup3_test,
) = train_test_smote(X_3, Y_3, backup)



balanced data ratio in training set:
1.0
balanced data ratio in test set:
2.090721649484536
balanced data ratio in training set:
1.0
balanced data ratio in test set:
123.91666666666667
balanced data ratio in training set:
1.0
balanced data ratio in test set:
0.47830374753451677


In [9]:


def train(X_train, X_test, Y_train, Y_test, backup_test, name,
          n_estimators_grid=(100,), max_depth_grid=(10,)):
    """Fit random forests over a small grid, keep the best by ROC AUC, and pickle it.

    For every ``(n_estimators, max_depth)`` pair a ``RandomForestClassifier``
    (``random_state=10``, ``n_jobs=-1``) is fitted on the training data and
    scored with ROC AUC on ``X_test`` / ``Y_test``. The highest-scoring model
    is written to ``'newuser_best_model_<name>_0715.pkl'`` in the current
    working directory and returned.

    Parameters
    ----------
    X_train, Y_train
        Training features and targets.
    X_test, Y_test
        Features and targets used to score each candidate (the notebook passes
        its validation split here).
    backup_test
        Unused; kept so existing calls keep working.
    name : str
        Inserted into the output file name.
    n_estimators_grid, max_depth_grid : iterable of int, optional
        Candidate values to search. The defaults reproduce the single
        configuration that used to be hard-coded (100 trees, depth 10).

    Returns
    -------
    RandomForestClassifier
        The fitted model with the highest ROC AUC.

    Raises
    ------
    ValueError
        If no candidate could be selected (for example an empty grid), instead
        of silently pickling ``None``.
    """
    print('Training models...')
    best_model = None
    # Start below any attainable score so the first candidate is always kept,
    # even when its AUC is 0.
    best_ratio, best_ne, best_md = float('-inf'), 0, 0

    for ne in n_estimators_grid:
        for md in max_depth_grid:
            rf = RandomForestClassifier(max_depth=md, n_estimators=ne, n_jobs=-1, random_state=10)
            rf_fit = rf.fit(X_train, Y_train)
            prediction = rf_fit.predict(X_test)
            prediction_pro = rf_fit.predict_proba(X_test)

            # Selection metric: ROC AUC computed from the second probability column.
            ratio = roc_auc_score(Y_test, prediction_pro[:, 1])

            print('-----------------------------------------------------------')
            if ratio > best_ratio:
                best_md = md
                best_ne = ne
                best_ratio = ratio
                best_model = rf_fit

            print('n_estimators:  max_depth:',(ne, md))
            print('confusion matrix\n', confusion_matrix(Y_test,prediction))
            print('ratio:{0}'.format(ratio))
            print()
            print()

    if best_model is None:
        raise ValueError('no model was trained: the hyper-parameter grid is empty or every score was NaN')

    print('best_n:{0}, best_depth:{1}'.format(best_ne, best_md))
    print('best_ratio using validation set:{0}'.format(best_ratio))

    joblib.dump(best_model, 'newuser_best_model_{}_0715.pkl'.format(name))

    print('best_model written to file!')
    return best_model


In [14]:

def evaluate(model, X_test, Y_test, backup_test):
    """Score a fitted model on a hold-out set and return a per-row report frame.

    Prints the ROC AUC of the second probability column against ``Y_test``,
    then returns a copy of ``backup_test`` with two added columns:

    * ``'label'`` - ``Y_test`` (aligned on the index when it is a Series);
    * ``'prob'``  - the second column of ``model.predict_proba(X_test)``,
      assigned positionally.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``.
    X_test, Y_test
        Hold-out features and targets.
    backup_test : pandas.DataFrame
        Extra columns for the same rows, in the same order as ``X_test``
        (the probabilities are attached by position). It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``backup_test`` plus the ``'label'`` and ``'prob'`` columns.
    """
    print('Evalluating models...')
    # Only probabilities are needed; the former extra model.predict() pass
    # produced a value that was never used.
    prediction_pro = model.predict_proba(X_test)

    ratio = roc_auc_score(Y_test, prediction_pro[:, 1])
    print('The ratio using evaluation set:{0}'.format(ratio))

    outfile = backup_test.copy()
    outfile['label'] = Y_test
    outfile['prob'] = prediction_pro[:, 1]

    print('Evaluate Done!')
    return outfile



In [11]:


# One model per target; each is selected on its validation split and pickled under `name`.
model1 = train(
    X_train=X1_train, X_test=X1_validate,
    Y_train=Y1_train, Y_test=Y1_validate,
    backup_test=backup1_validate,
    name='active_in_8_14',
)
model2 = train(
    X_train=X2_train, X_test=X2_validate,
    Y_train=Y2_train, Y_test=Y2_validate,
    backup_test=backup2_validate,
    name='active_in_84_90',
)
model3 = train(
    X_train=X3_train, X_test=X3_validate,
    Y_train=Y3_train, Y_test=Y3_validate,
    backup_test=backup3_validate,
    name='activity_half_reduced_8_14',
)


Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[868 146]
 [196 289]]
ratio:0.7979096768946095


best_n:100, best_depth:10
best_ratio using validation set:0.7979096768946095
best_model written to file!
Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[1474   13]
 [  12    0]]
ratio:0.6978536202645146


best_n:100, best_depth:10
best_ratio using validation set:0.6978536202645146
best_model written to file!
Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[289 196]
 [146 868]]
ratio:0.7979096768946095


best_n:100, best_depth:10
best_ratio using validation set:0.7979096768946095
best_model written to file!


In [15]:



# Score each selected model on its own test split.
out_file1 = evaluate(model=model1, X_test=X1_test, Y_test=Y1_test, backup_test=backup1_test)
out_file2 = evaluate(model=model2, X_test=X2_test, Y_test=Y2_test, backup_test=backup2_test)
out_file3 = evaluate(model=model3, X_test=X3_test, Y_test=Y3_test, backup_test=backup3_test)


Evalluating models...
The ratio using evaluation set:0.8255334877531911
Evaluate Done!
Evalluating models...
The ratio using evaluation set:0.6865771812080538
Evaluate Done!
Evalluating models...
The ratio using evaluation set:0.825533487753191
Evaluate Done!


In [16]:

# Give each report's generic 'label' / 'prob' columns task-specific names so the
# three reports can be placed side by side.
out_file1 = out_file1.rename(columns={
    'label': 'active_in_8_14_label',
    'prob': 'active_in_8_14_probability',
})
out_file2 = out_file2.rename(columns={
    'label': 'active_in_84_90_label',
    'prob': 'active_in_84_90_probability',
})
out_file3 = out_file3.rename(columns={
    'label': 'activity_half_reduced_8_14_label',
    'prob': 'activity_half_reduced_8_14_probability',
})

# Index-on-index merges: report 1 supplies every column, reports 2 and 3 only
# contribute their label/probability pair.
out_file = (
    out_file1
    .merge(
        out_file2[['active_in_84_90_label', 'active_in_84_90_probability']],
        left_index=True,
        right_index=True,
    )
    .merge(
        out_file3[['activity_half_reduced_8_14_label', 'activity_half_reduced_8_14_probability']],
        left_index=True,
        right_index=True,
    )
)

path = 'C:/intern/new_user_prediction/newuser_model_sampled'
out_file.to_csv(os.path.join(path, '0713_data_evaluation.csv'), index=False)
print('Evaluation written to file!')



Evaluation written to file!


In [17]:

def predict(path, pklpath, model_name):
    """Load a pickled model and score every row of a tab-separated data file.

    The file is prepared the same way as the training frame in this notebook:
    the first 17 columns are dropped and an integer-coded ``platform`` column
    (android -> 0, ios -> 1) is inserted in front of the remaining columns.

    Parameters
    ----------
    path : str
        Tab-separated input file with a header row.
    pklpath : str
        Directory containing the pickled model.
    model_name : str
        File name of the pickled model inside ``pklpath``.

    Returns
    -------
    prediction : array-like
        Output of ``model.predict`` for every row.
    probability : array-like
        Second column of ``model.predict_proba`` for every row.
    df_new : pandas.DataFrame
        Identifier and daily-count columns copied from the input file.
    fi : pandas.DataFrame
        ``feature_name`` / ``importance`` pairs sorted by descending importance.
    """
    print('Starting prediction...')
    df = pd.read_csv(path,header=0,low_memory=False,delimiter='\t')

    # Integer-code the platform on a copy of that single column; `df` keeps the strings.
    platform_codes = df['platform'].copy()
    platform_codes.loc[platform_codes == 'android'] = 0
    platform_codes.loc[platform_codes == 'ios'] = 1

    # Feature frame: everything after the first 17 columns, with the coded platform in front.
    # NOTE(review): every remaining column is fed to the model; confirm the prediction
    # file carries exactly the columns the model was trained on.
    feature = df.drop(df.columns[0:17], axis=1)
    feature.insert(loc=0, column='platform', value=platform_codes)

    # Columns handed back to the caller next to the predictions.
    kept_columns = [
        'site_id', 'device_id', 'cookie_id', 'data_installed', 'platform', 'country',
        'day1_prod_num', 'day2_prod_num', 'day3_prod_num', 'day4_prod_num',
        'day5_prod_num', 'day6_prod_num', 'day7_prod_num',
    ]
    df_new = df[kept_columns].copy()

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files from a trusted location.
    # `joblib` comes from the notebook's top-level import cell. The handle is closed
    # as soon as the model has been read.
    with open(os.path.join(pklpath, model_name), 'rb') as pkl:
        model = joblib.load(pkl)

    prediction = model.predict(feature)
    prediction_pro = model.predict_proba(feature)

    # Feature importances, most important first.
    featurename = feature.columns
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    fi = pd.DataFrame()
    fi['feature_name'] = featurename[indices]
    fi['importance'] = importances[indices]

    print('Prediction complete!')

    return prediction, prediction_pro[:,1], df_new, fi
   


In [19]:


# Score the 0715 sample file with the persisted 'active_in_8_14' model.
path='C:/intern/new_user_prediction/newuser_model_sampled/0715_test_sampled.csv'
prediction1, prediction1_proba, prediction_meta, fi1 = predict(path,'C:/intern/new_user_prediction/newuser_model_sampled','newuser_best_model_active_in_8_14_0715.pkl')

# The identifier frame returned by predict() gets its own name. Rebinding the global
# `backup` here (as before) replaced the training identifier frame that the
# train/validation/test split cell reads, so re-running that cell afterwards would
# slice the wrong data. A copy also keeps the added column off the frame predict() returned.
outfile = prediction_meta.copy()
outfile['active_in_8_14_probability'] = prediction1_proba


Starting prediction...
Prediction complete!


In [20]:

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# A/B test split, keyed on the 8th character of cookie_id:
#   '0'-'7'       -> group A
#   anything else -> group B (including ids shorter than 8 characters or missing,
#                    which used to raise an exception)
eighth_char = outfile['cookie_id'].str[7]
in_group_A = eighth_char.isin(['0','1','2','3','4','5','6','7'])

# Positional row numbers of each group, kept under their original names.
index_A = np.flatnonzero(in_group_A.values).tolist()
index_B = np.flatnonzero(~in_group_A.values).tolist()

# The indices are positions, so select with .iloc. The previous label-based .loc
# lookup was only correct while `outfile` carried a default 0..n-1 index.
outfile_A = outfile.iloc[index_A, :]
outfile_B = outfile.iloc[index_B, :]

# Highest predicted probability first, for each group and for the full set.
outfile_A = outfile_A.sort_values(by='active_in_8_14_probability', ascending=False)
outfile_B = outfile_B.sort_values(by='active_in_8_14_probability', ascending=False)
outfile = outfile.sort_values(by='active_in_8_14_probability', ascending=False)

In [21]:


print('Writing to file...')
path = 'C:/intern/new_user_prediction/newuser_model_sampled'
# Group A, group B and the full ranked list each go to their own CSV, without the index.
outfile_A.to_csv(path_or_buf=os.path.join(path, '0715_test_A_prediction.csv'), index=False)
outfile_B.to_csv(path_or_buf=os.path.join(path, '0715_test_B_prediction.csv'), index=False)
outfile.to_csv(path_or_buf=os.path.join(path, '0715_test_prediction.csv'), index=False)
print('Finish writing to file.')

# Feature-importance table of the 'active_in_8_14' model, with task-specific column names.
fi = fi1[['feature_name', 'importance']].rename(columns={
    'feature_name': 'feature_active_in_8_14',
    'importance': 'importance_active_in_8_14',
})

fi.to_csv(path_or_buf=os.path.join(path, '0715_test_prediction_feature_importances.csv'), index=False)
print('Finish writing feature importances to file.')
print('All complete!')



Writing to file...
Finish writing to file.
Finish writing feature importances to file.
All complete!


'\n注释结束\n'