In [None]:
# standard library
import math
# data processing
import pandas as pd
import numpy as np
# modeling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn import tree
# evaluation
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve,auc
from scipy.stats import ks_2samp
try:
    # public import location; the private sklearn.tree.export module was
    # deprecated and later removed from scikit-learn
    from sklearn.tree import export_text
except ImportError:
    from sklearn.tree.export import export_text
# visualization
import matplotlib.pyplot as plt
import pydotplus 
from IPython.display import Image
from IPython.display import display
# ignoring the warning message
import warnings
warnings.filterwarnings("ignore")

#### Original Data

In [None]:
# load the original dataset; df is reused by the plots in the analysis section
df = pd.read_excel('original_dataset.xlsx')
# summarize the columns, dtypes and non-null counts
df.info()

#### Final Modeling Data

In [None]:
# gather all variables data and response data
df_processed = pd.read_excel('tree_data_0410_new.xlsx')
# because some companies don't have sectors or have wrong sectors, we simply drop these companies
# (dropna() removes every row that has a missing value in any column)
df_drop = df_processed.dropna()
# finally we have 5211 observations, each observation represents one public company
df_drop.info()

#### Logistic Regression

In [None]:
# predictors that are evaluated one at a time by the univariate regressions
# NOTE(review): the original loop read a global `cols` that was never defined in
# the notebook; it is assumed to be this predictor list -- confirm
cols = ['page_size_bytes', 'latest_edit_year_-_2018', 'avg_day_between_edits',
        'avg_edits_per_year', 'num_edits_2018', 'num_edits_2017', 'edits_change_2017_2018',
        'pageviews_60d', 'num_sections', 'num_redir_links', 'num_references']
# response columns (presumably 0/1 flags built from different margin-change thresholds)
lr_targets = ['margin_change', 'margin_change_90', 'margin_change_80']
# get the logistic regression data
lr = df_drop[cols + lr_targets]


# define logistic regression function
def logistic(lr, y, features=None):
    """Fit one univariate logistic regression per predictor and score it on a hold-out set.

    Parameters
    ----------
    lr : pandas.DataFrame
        Frame holding the predictor columns and the response column ``y``.
    y : str
        Name of the response column. The metrics below assume a binary 0/1
        response with 1 as the event class.
    features : list of str, optional
        Predictor columns to evaluate. Defaults to every column of ``lr``
        other than ``y``.

    Returns
    -------
    dict
        ``{feature: {'accuracy': float, 'gini': float, 'ks_stats': float}}``
    """
    if features is None:
        features = [c for c in lr.columns if c != y]
    # sample splitting: without stratification the partition depends only on the
    # number of rows and the seed, so one split serves every predictor
    train, test = train_test_split(lr, test_size = 0.3, random_state = 240)
    # 1-D targets: fitting on a single-column DataFrame triggers a conversion warning
    y_train = train[y]
    actuals = np.asarray(test[y])
    is_event = actuals == 1
    results = dict()
    for i in features:
        results[i] = dict()
        x_train = train[[i]]
        x_test = test[[i]]
        # fit and predict
        model = linear_model.LogisticRegression(class_weight="balanced")
        model.fit(x_train, y_train)
        # probability of the event class; ranking metrics need scores, not hard labels
        scores = model.predict_proba(x_test)[:, 1]
        # accuracy
        results[i]['accuracy'] = model.score(x_test, actuals)
        # gini: a negative value only means the direction is reversed on the test
        # set, so the magnitude is reported (same as flipping the predictions)
        fpr, tpr, thresholds = roc_curve(actuals, scores)
        results[i]['gini'] = abs((2 * auc(fpr, tpr)) - 1)
        # ks statistic: largest gap between the score distributions of events and
        # non-events (unchanged if the scores are flipped)
        ks = ks_2samp(scores[is_event], scores[~is_event])
        results[i]['ks_stats'] = ks[0]
    return results


# get different lr results for different binary thresholds of margin change
p95 = logistic(lr, 'margin_change', cols)
p90 = logistic(lr, 'margin_change_90', cols)
p80 = logistic(lr, 'margin_change_80', cols)
# output the results
# pd.DataFrame(p95).transpose().to_excel('lr_p95.xlsx')
# pd.DataFrame(p90).transpose().to_excel('lr_p90.xlsx')
# pd.DataFrame(p80).transpose().to_excel('lr_p80.xlsx')

#### Decision Tree

In [None]:
# load the decision-tree dataset, using the 'Unnamed: 0' column as the row index
tree_data = pd.read_excel('dataset_0412_1.xlsx', index_col="Unnamed: 0")
# set the variables for decision tree: the response comes first, then the predictors
tree_model = tree_data[[
    'margin_change_90_new', 'Sector',
    'Shares_Held_by_All_Insider', 'Shares_Held_by_Institutions',
    'Float_Held_by_Institutions', 'Number_of_Institutions_Holding_Shares',
    'number_of_full-time_employee', 'male_rate', 'average_age',
    'age_below_40', 'tech_score', 'page_size_bytes',
    'latest_edit_year_-_2018', 'avg_day_between_edits',
    'avg_edits_per_year', 'num_edits_2018', 'num_edits_2017',
    'edits_change_2017_2018', 'pageviews_60d', 'num_sections',
    'num_redir_links', 'num_references', '2017-create_date', 'followers',
    'friends', 'tweets', 'verified', 'avg_retweet', 'avg_favourite',
    'Facebook', 'Twitter', 'Linkedin', 'Youtube', 'Instagram', 'Wikipedia',
    'media_pct', 'subscriberCount', 'videoCount', 'liked', 'disliked',
    'views', 'comment', 'video_2017', 'video_2018', 'first',
]]
# drop companies with wrong sectors or no sectors;
# tree_drop keeps the original Sector labels for the later join
tree_drop = tree_model.dropna()
# set sector to dummy variables (the response stays the first column)
tree_model = pd.get_dummies(tree_drop, columns=['Sector'])
# split the dataset into train and test
train, test = train_test_split(tree_model, test_size=0.3, random_state=240)
x_train = train.drop(columns=['margin_change_90_new'])
y_train = train[['margin_change_90_new']]
x_test = test.drop(columns=['margin_change_90_new'])
y_test = test[['margin_change_90_new']]
# an attempt to handle the imbalanced data (left disabled)
# oversample = SMOTE()
# x_train, y_train = oversample.fit_resample(x_train, y_train)
# establish a decision tree
model = tree.DecisionTreeClassifier(
    criterion="gini",
    max_leaf_nodes=10,
    min_samples_leaf=100,
    random_state=123,
    class_weight='balanced',
)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
actuals = np.array(y_test)

In [None]:
# show the tree image
# feature names come from the training matrix: tree_model gains a 'leaf' column
# in the leaf-analysis cell, which would no longer match the fitted features if
# this cell were run again afterwards
feature_names = list(x_train.columns)
dot_data = tree.export_graphviz(model, out_file=None, feature_names=feature_names, class_names=['0', '1'])
graph = pydotplus.graphviz.graph_from_dot_data(dot_data)
# only the last expression of a cell is rendered automatically, so display explicitly
display(Image(graph.create_png()))

# evaluate the tree performance
# rank-based metrics use the predicted probability of class 1 rather than hard labels;
# the global `predictions` is left untouched so the cell can be re-run safely
test_labels = actuals.ravel()
test_scores = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(test_labels, test_scores)
roc_auc = auc(fpr, tpr)
gini = (2 * roc_auc) - 1
if gini < 0:
    # reversed ranking on the test set: flip the scores, as the original flipped the labels
    test_scores = 1 - test_scores
    fpr, tpr, thresholds = roc_curve(test_labels, test_scores)
    roc_auc = auc(fpr, tpr)
    gini = (2 * roc_auc) - 1
# ks statistic: largest gap between the score distributions of the two classes
ks = ks_2samp(test_scores[test_labels == 1], test_scores[test_labels != 1])
print('gini:', gini)
print('ks statistic:', ks[0])
print('ks p-value:', ks[1])

# show how many nodes and leaves the tree have
print('nodes:', model.tree_.node_count)
print('leaves:', model.tree_.n_leaves)
# show the split rules of each leaf node
tree_rules = export_text(model, feature_names=feature_names)
print(tree_rules)

# show the feature importance
importances = pd.DataFrame({'feature':x_train.columns,'importance':np.round(model.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)
importances

In [None]:
# calculate the event rate of each leaf node
# assign() returns a new frame, so the slice produced by train_test_split is not
# written to in place and the cell gives the same result when run again
train = train.assign(leaf=model.apply(x_train))
grouped = train.groupby(['leaf', 'margin_change_90_new']).size().unstack()
# a leaf with no rows of one class shows up as NaN after unstack
grouped = grouped.fillna(0)
grouped['sum'] = grouped[0.0] + grouped[1.0]
grouped['event_rate'] = grouped[1.0]/grouped['sum']
# only the last expression of a cell is rendered automatically, so display explicitly
display(grouped)
# predict the leaf node each company belongs to for overall dataset
# select the fitted feature columns by name: once 'leaf' has been added, a
# positional slice would hand the model one column too many on a second run
tree_model['leaf'] = model.apply(tree_model[x_train.columns])
# join the leaf column with original dataset for detailed analysis
tree_90_data = tree_drop[['Sector']].merge(tree_model, how='left', left_index=True, right_index=True)
# tree_90_data.to_excel("tree_90.xlsx")

#### Analysis and Visualization

In [None]:
# histogram of the original numerical margin change, with the x-axis limited to [-50, 50]
ax = df[['margin_change']].plot.hist(grid=True, bins=500, rwidth=0.9, legend=None)
ax.set_xlim([-50, 50])
ax.set_title('Distribution of Margin Change', fontsize=12)
ax.set_xlabel('Margin Change', fontsize=12)
ax.set_ylabel('Number of Companies', fontsize=12)
plt.show()

In [None]:
# bar chart of the binary margin change: keep only the rows flagged 0 or 1
flag = df['margin_change_90']
df_binary = df[flag.eq(0) | flag.eq(1)]
class_counts = df_binary['margin_change_90'].value_counts()
ax = class_counts.plot(kind='bar')
ax.set_title('Distribution of Margin Change', fontsize=12)
ax.set_xlabel('Margin Change', fontsize=12)
ax.set_ylabel('Number of Companies', fontsize=12)
plt.show()