# Logistic Regression

In [None]:
# read data
import pandas as pd
import numpy as np 
from pandas import read_excel
# Load the workbook (path is relative to the notebook's working directory).
df_new = read_excel('woe by sector.xlsx')

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
from sklearn.metrics import roc_curve, auc
from scipy.stats import ks_2samp

# Fit one single-variable logistic regression per candidate feature and store
# its hold-out accuracy, Gini and Kolmogorov-Smirnov statistics in `results`
# (keyed by feature name).
results = dict()
variables = ['subscriberCount', 'videoCount', 'liked', 'disliked', 'views',
             'comment', 'video_2017', 'video_2018', 'first']
for i in variables:
    data = df_new[[i, 'margin_change_95']]
    results[i] = dict()
    # sample split
    # random_state is what pins sklearn's shuffle. Seeding the stdlib `random`
    # module does not affect train_test_split, and `random` is not imported
    # at this point in the notebook.
    train, test = train_test_split(data, test_size=0.3, random_state=500)
    x_train = np.array(train.iloc[0:, 0:1])
    # 1-D target: a column vector makes sklearn emit a DataConversionWarning.
    y_train = np.ravel(train['margin_change_95'])
    x_test = np.array(test.iloc[0:, 0:1])
    y_test = test['margin_change_95']
    # fit and predict
    model = linear_model.LogisticRegression(class_weight="balanced")
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    actuals = np.array(y_test)
    # Ranking metrics (ROC / Gini / K-S) need a continuous score. Hard class
    # labels collapse the ROC curve to a single operating point.
    # classes_ is sorted, so column 1 is the larger label.
    scores = model.predict_proba(x_test)[:, 1]
    # accuracy
    acc = model.score(x_test, actuals)
    results[i]['accuracy'] = acc
    # gini
    fpr, tpr, thresholds = roc_curve(actuals, scores)
    roc_auc = auc(fpr, tpr)
    gini = (2 * roc_auc) - 1
    if gini < 0:
        # Score ranks the classes in reverse on the hold-out set; invert it so
        # the reported Gini reflects separation strength.
        scores = 1 - scores
        fpr, tpr, thresholds = roc_curve(actuals, scores)
        roc_auc = auc(fpr, tpr)
        gini = (2 * roc_auc) - 1
    results[i]['gini'] = gini
    # K-S statistic: distance between the score distributions of the two
    # actual classes (not between the labels and the predictions).
    is_event = actuals == model.classes_[1]
    ks_result = ks_2samp(scores[is_event], scores[~is_event])
    results[i]['ks_stats'] = ks_result[0]
    results[i]['ks_pvalue'] = ks_result[1]


# Decision Tree

In [None]:
# Independent variables: the feature columns selected from the data frame for
# the decision tree. Line breaks are cosmetic; names and order are unchanged.
cols = [
    'Shares_Held_by_All_Insider',
    'Shares_Held_by_Institutions',
    'Float_Held_by_Institutions',
    'Number_of_Institutions_Holding_Shares',
    'number_of_full-time_employee',
    'male_rate',
    'average_age',
    'age_below_40',
    'tech_score',
    'page_size_bytes',
    'latest_edit_year_-_2018',
    'avg_day_between_edits',
    'avg_edits_per_year',
    'num_edits_2018',
    'num_edits_2017',
    'edits_change_2017_2018',
    'pageviews_60d',
    'num_sections',
    'num_redir_links',
    'num_references',
    '2017-create_date',
    'followers',
    'friends',
    'tweets',
    'verified',
    'avg_retweet',
    'avg_favourite',
    'Facebook',
    'Twitter',
    'Linkedin',
    'Youtube',
    'Instagram',
    'Wikipedia',
    'media_pct',
    'subscriberCount',
    'videoCount',
    'liked',
    'disliked',
    'views',
    'comment',
    'video_2017',
    'video_2018',
    'first',
    # columns carrying the `Sector_` prefix
    'Sector_Basic Materials',
    'Sector_Communication Services',
    'Sector_Consumer Cyclical',
    'Sector_Consumer Defensive',
    'Sector_Energy',
    'Sector_Financial Services',
    'Sector_Healthcare',
    'Sector_Industrials',
    'Sector_Real Estate',
    'Sector_Technology',
]

In [None]:
# split data
import random
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state fixes the shuffle.
train, test = train_test_split(df_new, random_state=101, test_size=0.3)
# Features are the `cols` columns; the target stays a one-column DataFrame.
x_train, y_train = train[cols], train[['margin_change_90']]
x_test, y_test = test[cols], test[['margin_change_90']]

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import pydotplus
import graphviz
from IPython.display import Image
# Class-balanced tree capped at 10 leaves, each holding at least 100 training
# samples; fit() returns the estimator itself, so it can be chained.
model = DecisionTreeClassifier(
    class_weight='balanced',
    max_leaf_nodes=10,
    min_samples_leaf=100,
    random_state=123,
).fit(x_train, y_train)

# Hard class predictions and the matching ground truth for the test split.
predictions = model.predict(x_test)
actuals = np.array(y_test)

# Export the fitted tree to DOT text and build a graph object from it.
feature_names = cols
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    class_names=['0', '1'],
)
graph = pydotplus.graphviz.graph_from_dot_data(dot_data)
# Optional: persist the rendering to disk.
# graph.write_png('updated y balanced decision tree p50 10 nodes.png')
Image(graph.create_png())

In [None]:
from sklearn.metrics import roc_curve, auc
from scipy.stats import ks_2samp
# gini
# Ranking metrics need a continuous score. Hard 0/1 predictions collapse the
# ROC curve to a single operating point, so score with the leaf class
# probabilities instead. classes_ is sorted, so column 1 is the larger label.
scores = model.predict_proba(x_test)[:, 1]
act = np.ravel(actuals)  # flatten the (n, 1) target column to 1-D
fpr, tpr, thresholds = roc_curve(act, scores)
roc_auc = auc(fpr, tpr)
gini = (2 * roc_auc) - 1
print('gini: ', gini)
# K-S statistic: distance between the score distributions of the two actual
# classes (not between the labels and the predictions).
is_event = act == model.classes_[1]
ks_result = ks_2samp(scores[is_event], scores[~is_event])
print('ks_stats',ks_result[0])
print('ks_pvalue', ks_result[1])
# logical rules
from sklearn.tree import export_text
tree_rules = export_text(model, feature_names=list(x_train))
print(tree_rules)
# feature importance, highest first
importances = pd.DataFrame({'feature':x_train.columns,'importance':np.round(model.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)

In [None]:
# Event rate per terminal node, measured on the training split.
# model.apply() returns, for every row, the id of the leaf it lands in.
leaf_list = list(model.apply(x_train, check_input=True))
final_node = list(np.unique(leaf_list))  # distinct leaf ids, ascending
actual_value = list(train['margin_change_90'])

# One row per training sample: (leaf id, observed class).
event_rate_leaf = pd.DataFrame({'leaf': leaf_list, 'actual_value': actual_value})

# Count samples per (leaf, class), pivot classes into columns, and treat a
# class that never occurs in a leaf as a zero count.
group_data = (
    event_rate_leaf
    .groupby(['leaf', 'actual_value'])
    .size()
    .unstack()
    .fillna(0)
)
group_data['sample number'] = group_data[[0, 1]].sum(axis=1)
group_data['event rate'] = group_data[1].div(group_data['sample number'])