# In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import preprocessing, ensemble
from sklearn.model_selection import cross_val_score
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import matplotlib.cm as cm
 
from sklearn import datasets, linear_model, metrics
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
#imported everything necessary
 
# Load the raw data; the CSV file uses ';' as its field separator.
dataset = pd.read_csv(
    'C:/Users/xshitova/Documents/anon_file2.csv',
    sep=';',
)
 
# Explanatory variables handed to the models (everything except client id).
# Selecting dataset[cols] yields the columns in exactly this order.
cols = [
    "numerical_feature1",
    "numerical_feature2",
    "numerical_feature3",
    "category_feature1",
    "category_feature2",
    "category_feature3",
    "category_feature4",
    "numerical_feature4",
    "numerical_feature5",
    "numerical_feature6",
    "numerical_feature7",
    "numerical_feature8",
    "numerical_feature9",
    "numerical_feature10",
]

# Column the models predict, kept as a one-element list so that
# dataset[target] is a single-column frame.
target = ["target_variable"]
 
# Drop every row that is missing a value in any of the columns below.
# DataFrame.dropna() returns a new frame instead of modifying the original,
# so the result has to be assigned back for the filtering to take effect.
required_columns = [
    "category_feature1",
    "category_feature2",
    "category_feature3",
    "category_feature4",
    "numerical_feature4",
    "numerical_feature8",
]
dataset = dataset.dropna(subset=required_columns)
 
# Sanity check: for every explanatory column print its name, the number of
# missing values and the number of non-missing values.
# The cleaning above is repeated until every missing-value count is 0
# (meaning there are no nan values in any of the columns).
# Original author's note: 197676 lines left.
for col in cols:  # iterate over all columns, not just the first twelve
    print([col])
    print(dataset[col].isnull().sum())
    print(dataset[col].count())
 
# Hold out 30% of the rows as a test set using scikit-learn's helper
# (earlier this split was done by hand). The fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[cols],
    dataset[target],
    test_size=0.3,
    random_state=42,
)
 
# First model: a random forest classifier with default settings.
print("Random forest model")
random_forest1 = RandomForestClassifier()
# .values.ravel() flattens the single-column label frame into a 1-D array
random_forest1.fit(X_train, y_train.values.ravel())
y_pred = random_forest1.predict(X_test)

# Report accuracy, the confusion matrix and the classification report
# for the held-out test set.
print("Target score - {}".format(metrics.accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
 
# Sum of the target column next to the size of each split (with 0/1 labels
# this is the number of buyers). Series.sum() is vectorised, whereas the
# builtin sum() walks the Series one element at a time in Python.
print("Signal  - " + str(y_train["target_variable"].sum()) + " bought it out of " + str(len(y_train)) + " in train dataset")
print("Signal  - " + str(y_test["target_variable"].sum()) + " bought it out of " + str(len(y_test)) + " in test dataset")

# Pairwise correlation values between the explanatory variables.
print(dataset[cols].corr())
 
# Second model: an XGBoost classifier with default settings, trained and
# evaluated on the same split as the random forest.
xgb_model1 = XGBClassifier()
xgb_model1.fit(X_train, y_train.values.ravel())

# Predictions for the test data and the resulting accuracy.
y_pred = xgb_model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Target score - {} for XGBoost model".format(accuracy))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
 
# ROC curves of the random forest and the XGBoost model on shared axes.
# Column 1 of predict_proba is used as the score fed to roc_curve.
y_pred_prob = random_forest1.predict_proba(X_test)[:, 1]
y_pred_prob1 = xgb_model1.predict_proba(X_test)[:, 1]

# False/true positive rates and thresholds: first RF, then XGB.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_prob1)

# Dashed diagonal as a reference, then one line per model.
plt.plot([0, 1], [0, 1], 'k--')
for false_pos, true_pos, line_color in ((fpr, tpr, 'red'), (fpr1, tpr1, 'blue')):
    plt.plot(false_pos, true_pos, color=line_color)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (red for Random Forest, blue for XGBoost)')
plt.show()
 
# Draw the correlation matrix of the features as a colour-coded image.
feature_corr = dataset[cols].corr()
plt.matshow(feature_corr)
 
# Cross validation for the random forest on the full cleaned dataset,
# with 10, 20 and 30 folds.
# The labels are flattened to a 1-D array, the same way the fit() calls on
# the train split do, instead of passing a single-column DataFrame as y.
cv_labels = dataset[target].values.ravel()
cv_scores_rd10 = cross_val_score(random_forest1, dataset[cols], cv_labels, cv=10)
print("Average 10-Fold CV Score for random forest: {}".format(np.mean(cv_scores_rd10)))
cv_scores_rd20 = cross_val_score(random_forest1, dataset[cols], cv_labels, cv=20)
print("Average 20-Fold CV Score for random forest: {}".format(np.mean(cv_scores_rd20)))
cv_scores_rd30 = cross_val_score(random_forest1, dataset[cols], cv_labels, cv=30)
print("Average 30-Fold CV Score for random forest: {}".format(np.mean(cv_scores_rd30)))
 
# Cross validation for XGBoost on the full cleaned dataset,
# with 10, 20 and 30 folds.
# The labels are flattened to a 1-D array, the same way the fit() calls on
# the train split do, instead of passing a single-column DataFrame as y.
cv_labels = dataset[target].values.ravel()
cv_scores_xgb10 = cross_val_score(xgb_model1, dataset[cols], cv_labels, cv=10)
print("Average 10-Fold CV Score for XGBoost: {}".format(np.mean(cv_scores_xgb10)))
cv_scores_xgb20 = cross_val_score(xgb_model1, dataset[cols], cv_labels, cv=20)
print("Average 20-Fold CV Score for XGBoost: {}".format(np.mean(cv_scores_xgb20)))
cv_scores_xgb30 = cross_val_score(xgb_model1, dataset[cols], cv_labels, cv=30)
print("Average 30-Fold CV Score for XGBoost: {}".format(np.mean(cv_scores_xgb30)))
# Feature importances of the random forest as a horizontal bar plot,
# sorted ascending so the least important feature sits at the bottom.
importances1 = random_forest1.feature_importances_
indices1 = np.argsort(importances1)
plt.title('Random forest feature importances')
plt.barh(range(len(indices1)), importances1[indices1], color='blue')
# The bars are drawn in sorted order, so the tick labels have to be
# reordered with the same indices; otherwise names land on the wrong bars.
plt.yticks(range(len(indices1)), [cols[i] for i in indices1])
plt.show()
 
# Feature importances of the XGBoost model as a horizontal bar plot,
# sorted ascending so the least important feature sits at the bottom.
importances1 = xgb_model1.feature_importances_
indices1 = np.argsort(importances1)
plt.title('XGB feature importances')
plt.barh(range(len(indices1)), importances1[indices1], color='blue')
# The bars are drawn in sorted order, so the tick labels have to be
# reordered with the same indices; otherwise names land on the wrong bars.
plt.yticks(range(len(indices1)), [cols[i] for i in indices1])
plt.show()

# Feature importances from a random forest refitted on the whole dataset.
# cols2 is a separate copy of the feature list; presumably it is edited by
# hand to drop features one after another and re-run this cell
# (NOTE(review): confirm intent). Nothing below hard-codes the number of
# features, so the plot keeps working when the list shrinks.
cols2 = [
    'numerical_feature1',
    'numerical_feature2',
    'numerical_feature3',
    'category_feature1',
    'category_feature2',
    'category_feature3',
    'category_feature4',
    'numerical_feature4',
    'numerical_feature5',
    'numerical_feature6',
    'numerical_feature7',
    'numerical_feature8',
    'numerical_feature9',
    'numerical_feature10',
]
random_forest2 = RandomForestClassifier()
# Flatten the single-column label frame to 1-D, as the other fit() calls do.
random_forest2.fit(dataset[cols2], dataset[target].values.ravel())
importances1 = random_forest2.feature_importances_
indices1 = np.argsort(importances1)
plt.barh(range(len(cols2)), importances1[indices1], color='blue')
# Reorder the labels with the same indices as the bars so each name is
# attached to its own importance value.
plt.yticks(range(len(cols2)), [cols2[i] for i in indices1])
plt.show()