In [374]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, StackingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import plot_roc_curve

### deal with some left over from preprocessing

In [351]:
df = pd.read_csv('step2_data/combined.csv', index_col=0)

  df = pd.read_csv('step2_data/combined.csv', index_col=0)


In [352]:
le = LabelEncoder()
df['name'] = le.fit_transform(df['name'])
df['city'] = le.fit_transform(df['city'])
df['postal_code	'] = le.fit_transform(df['postal_code'])

In [353]:
df = df[df['RestaurantsPriceRange2'] != 'None']
df['RestaurantsPriceRange2'] = df['RestaurantsPriceRange2'].astype(float)
df = df.dropna(subset='RestaurantsPriceRange2')
df['RestaurantsPriceRange2'] = df['RestaurantsPriceRange2'].astype(int)

In [354]:
## True/False/None datatype issue: 'RestaurantsDelivery', 'BusinessAcceptsCreditCards', 'RestaurantsTakeOut', 'RestaurantsReservations', 'RestaurantsGoodForGroups'
true_false_col = ['RestaurantsDelivery', 'BusinessAcceptsCreditCards', 'RestaurantsTakeOut', 'RestaurantsReservations', 'RestaurantsGoodForGroups']

def tf_issue(col_li, df):

    for col in col_li:
        df[col] = df[col].astype('|S')
        df[col] = df[col].apply(lambda line: b'False' if line == b'nan' else line)
        df[col] = df[col].apply(lambda line: b'False' if line == b'None' else line)
        df[col] = LabelEncoder().fit_transform(df[col])
        df[col] = df[col].astype(int)

tf_issue(true_false_col, df)

In [355]:
# Weird u in the string
weird_u = ['WiFi', 'Alcohol', 'RestaurantsAttire', 'NoiseLevel', 'Smoking']

def u_issue(col_li, df):

    for col in col_li:
        df[col] = df[col].apply(lambda line: 'none' if type(line) == float else (line.split("\'")[1].lower() if len(line.split("\'")) == 3 else line.split("\'")[0].lower()))
        df[col] = LabelEncoder().fit_transform(df[col])
        df[col] = df[col].astype(int)

u_issue(weird_u, df)

In [356]:
# AcceptsInsurance, Open24Hours RestaurantsCounterService, has no useful value
# DietaryRestrictions have only 5 records
drop_col = ['business_id', 'AcceptsInsurance', 'Open24Hours', 'DietaryRestrictions', 'RestaurantsCounterService', 'Caters', 'HasTV', 'GoodForKids', 'DogsAllowed', 
            'HappyHour', 'WheelchairAccessible', 'OutdoorSeating', 'BikeParking', 'RestaurantsAttire', 'Ambience', 'Smoking', 'Music', 'GoodForDancing', 
            'BusinessAcceptsBitcoin', 'CoatCheck', 'BestNights', 'Corkage', 'BYOBCorkage', 'BYOB', 'AgesAllowed', 'ByAppointmentOnly', 'RestaurantsTableService',
            'DriveThru', 'BusinessParking', 'GoodForMeal']

df.drop(columns = drop_col, inplace=True)

In [357]:
# delete columns only with one unique values
del_col = []

for i in df.columns:
    if df[i].nunique() <= 1:
        del_col.append(i)

df.drop(columns = del_col, inplace=True)

In [364]:
df = df.dropna(axis = 0)

### Modeling

In [367]:
# prepare data for modeling
X = df.drop(columns='is_open')
y = df['is_open']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

### Model Selection

In [370]:
models = [KNeighborsClassifier(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          GradientBoostingClassifier(),
          xgb.XGBClassifier()]

scores = []
for ml in models:
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
        scaler = preprocessing.MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        ml.fit(X_train, y_train)
        y_pred = ml.predict(X_test)
        scores.append(recall_score(y_pred, y_test))
    print(ml, 'average recall score:', sum(scores)/len(scores))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier() test accuracy: 0.6773733047822984
[[212 277]
 [175 737]]
DecisionTreeClassifier() test accuracy: 0.6388294075660242
[[240 249]
 [257 655]]
RandomForestClassifier() test accuracy: 0.7473233404710921
[[206 283]
 [ 71 841]]
GradientBoostingClassifier() test accuracy: 0.7466095645967167
[[220 269]
 [ 86 826]]
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg

### Hyperparameter tuning

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=0)

In [376]:
params = {
          'max_depth': range(2, 8, 1),
          'n_estimators': range(60, 220, 40),
          'learning_rate': [0.001, 0.005, 0.01, 0.05],
          'min_child_weight': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
          'objective':['binary:logistic', 'reg:logistic']
          }

model = xgb.XGBClassifier()

clf = GridSearchCV(
    estimator = model,
    param_grid = params,
    n_jobs = 5,
    cv = 5,
    scoring = "recall"
)
clf.fit(X_train, y_train)

xgb_best_params = clf.best_params_
print(clf.best_score_, xgb_best_params)

KeyboardInterrupt: 

### Fit final model

In [None]:
best_model = xgb.XGBClassifier(**xgb_best_params)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print('Accuracy Score : ', accuracy_score(y_test,y_pred))
print('Precision Score : ', precision_score(y_test,y_pred))
print('**Recall Score : ', recall_score(y_test,y_pred))
print('F1 Score : ', f1_score(y_test,y_pred))

confusion_matrix(y_test,y_pred)


In [None]:
xgb.plot_importance(best_model)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=123)

best_model = xgb.XGBClassifier(learning_rate= 0.05,
                               max_depth= 2,
                               min_child_weight= 0.001,
                               n_estimators= 60,
                               objective= 'binary:logistic')

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print('Accuracy Score : ', accuracy_score(y_test,y_pred))
print('Precision Score : ', precision_score(y_test,y_pred))
print('**Recall Score : ', recall_score(y_test,y_pred))
print('F1 Score : ', f1_score(y_test,y_pred))

confusion_matrix(y_test,y_pred)

xgb.plot_importance(best_model)

In [None]:
X_ = np.concatenate((X_train, X_test))
y_predict = best_model.predict(X_)
y_original = np.concatenate((y_train, y_test))
final_df = pd.DataFrame({"is_open": y_original, "pred_isopen":y_predict})

In [None]:
confusion_matrix(y_original,y_predict)

In [None]:
final_df[(final_df['pred_isopen'] == 0) & (final_df['is_open'] == 1)]

### Feature selection

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
features_name = []
for i in range(len(best_model.feature_importances_)):
    if best_model.feature_importances_[i] != 0:
        features_name.append(X.columns[i])

In [None]:
X_top_features = X[features_name]
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size = .2, random_state=0)

best_model = xgb.XGBClassifier(learning_rate= 0.05,
                               max_depth= 2,
                               min_child_weight= 0.001,
                               n_estimators= 60,
                               objective= 'binary:logistic')

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print('Accuracy Score : ', accuracy_score(y_test,y_pred))
print('Precision Score : ', precision_score(y_test,y_pred))
print('**Recall Score : ', recall_score(y_test,y_pred))
print('F1 Score : ', f1_score(y_test,y_pred))

confusion_matrix(y_test,y_pred)

In [None]:
np.sort(best_model.feature_importances_)

In [None]:
xgb.plot_importance(best_model)

In [None]:
thresholds = np.sort(best_model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(best_model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = xgb.XGBClassifier(learning_rate= 0.05,
                                        max_depth= 2,
                                        min_child_weight= 0.001,
                                        n_estimators= 60,
                                        objective= 'binary:logistic')
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    predictions = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))