In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# from tqdm.auto import tqdm
from tqdm import tqdm, tqdm_notebook

# Register the `progress_apply` methods on pandas objects; they are used by the
# preprocessing helpers further down.
# NOTE(review): recent tqdm releases appear to accept only keyword arguments here -
# confirm that passing `tqdm_notebook` positionally works with the installed version.
tqdm.pandas(tqdm_notebook)

from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_absolute_error

import xgboost as xgb
import catboost as cb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from sklearn.preprocessing import normalize

import collections

import random

# Raise pandas' display limits so long and wide frames are not truncated.
for _option_name, _option_value in (('display.max_rows', 500),
                                    ('display.max_columns', 500),
                                    ('display.width', 1000)):
    pd.set_option(_option_name, _option_value)

In [None]:
# Load the full shot log and discard the unnamed index column written by the CSV export.
total_data = pd.read_csv('data/data.csv').drop(columns='Unnamed: 0')

# Number the shots 1..N from the actual row count instead of a hard-coded length,
# so the assignment cannot fail with a length mismatch if the CSV changes size.
total_data['shot_id_number'] = range(1, len(total_data) + 1)

# Rows without an `is_goal` label form the test split, the rest the training split.
# Explicit copies are taken because later cells modify these frames; mutating a
# plain boolean-mask selection triggers pandas' SettingWithCopyWarning.
is_unlabelled = total_data.is_goal.isnull()
data_test = total_data[is_unlabelled].copy()
data_train = total_data[~is_unlabelled].copy()

In [None]:
# Report the number of rows in each split and in the full data set.
for size_label, frame in (("Train_size : ", data_train),
                          ("Test_size : ", data_test),
                          ("Total_size : ", total_data)):
    print(size_label, frame.shape[0])

In [None]:
def autolabel(rects, ax):
    """
    Annotate every bar in `rects` with its height, truncated to an integer.

    The label is centred horizontally on the bar and anchored at the bar's top
    edge on the axes `ax`.
    """
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        label = '%d' % int(bar_height)
        ax.text(x_center, bar_height, label, ha='center', va='bottom')
        
def show_unique_and_nans(df):
    """
    Plot, for every column of `df`, the count of distinct values (red bar)
    next to the count of NaNs (yellow bar), each bar labelled with its height.
    """
    # One summary row per column, later indexed by the column name.
    stats = pd.DataFrame(columns=['Name', 'Unique_Values', 'NaNs'])
    for position, column in enumerate(df.columns):
        n_unique = df[column].nunique()
        n_missing = df[column].isna().sum()
        stats.loc[position] = [column, n_unique, n_missing]
    stats.index = stats['Name']
    stats.drop('Name', inplace=True, axis = 1)

    bar_width = 0.35
    positions = np.arange(len(df.columns))

    fig, ax = plt.subplots(figsize = (15,5))
    unique_bars = ax.bar(positions, stats.Unique_Values, bar_width, color='r')
    nan_bars = ax.bar(positions + bar_width, stats.NaNs, bar_width, color='y')

    ax.set_ylabel('Number of values')
    ax.set_title('Unique values and NaNs in every column')
    # Put each tick between the two bars that belong to the same column.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(df.columns,  rotation = 90)
    ax.legend(['Unique Values', 'NaNs'],loc = "best")

    autolabel(unique_bars, ax)
    autolabel(nan_bars, ax)
    plt.tight_layout()
    plt.show()

In [None]:
def timer(start_time=None):
    """
    Two-phase stopwatch.

    Called without a (truthy) `start_time` it returns the current datetime.
    Called with a start time it prints the elapsed hours, minutes and seconds
    since then and returns None.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()

In [None]:
def feature_engg(df):
    """
    Add engineered features to `df` in place and return the same frame.

    Columns added:
      * ``dxp``                - ``power_of_shot * distance_of_shot``
      * ``lat`` / ``lng``      - the two halves of the comma-separated
                                 ``lat/lng`` string, as float64
      * ``tot_remaining_time`` - ``remaining_min + remaining_sec``

    Columns dropped: ``power_of_shot.1``, ``distance_of_shot.1``,
    ``remaining_min.1`` and ``remaining_sec.1`` (KeyError if any is absent).
    """
    df['dxp'] = df['power_of_shot'] * df['distance_of_shot']

    # Split on the first comma only. `n` is passed by keyword and the halves are
    # picked with `.str[i]`: the positional form `split(',', 1)` and tuple-unpacking
    # of the `.str` accessor are rejected by current pandas releases.
    coord_parts = df['lat/lng'].str.split(',', n=1)
    df['lat'] = coord_parts.str[0].astype(np.float64)
    df['lng'] = coord_parts.str[1].astype(np.float64)

    # NOTE(review): minutes and seconds are added without unit conversion; if a
    # real duration is intended this should probably be remaining_min * 60 +
    # remaining_sec - left unchanged here so existing feature values are kept.
    df['tot_remaining_time'] = df['remaining_min'] + df['remaining_sec']

    df.drop(['power_of_shot.1','distance_of_shot.1', 'remaining_min.1' , 'remaining_sec.1' ], inplace =True , axis = 1)

    return df
    
def impute_most_frequent(col, df):
    """
    Fill the NaNs of ``df[col]`` in place with the column's most frequent value.

    A column without any non-null value has no most frequent value; it is left
    unchanged instead of raising IndexError.
    """
    counts = df[col].value_counts()
    if counts.empty:
        return
    # value_counts() sorts by descending frequency, so the first label is the mode.
    df[col] = df[col].fillna(counts.index[0])

def impute_mean(col, df):
    """
    Fill the NaNs of ``df[col]`` in place with the column mean.

    The filled column is assigned back to `df`. Calling ``fillna(inplace=True)``
    on the ``df[col]`` selection can operate on a temporary object and leave
    `df` itself unchanged (e.g. under pandas copy-on-write).
    """
    df[col] = df[col].fillna(df[col].mean())
    
def remove_cols(col_list, df):
    """Drop the columns named in `col_list` from `df` in place (KeyError if one is missing)."""
    # `columns=` already selects the column axis, so no separate `axis` is passed.
    df.drop(columns=col_list, inplace=True)
    
def custom_impute_fun(grp, most_freq_cols=None):
    """
    Fill the NaNs of every column of `grp` and return `grp`.

    Each column is filled with its own most frequent value; a column that has
    no non-null value at all is filled with 0.

    Parameters
    ----------
    grp : DataFrame
        The frame (typically one group of a groupby) to fill; modified in place.
    most_freq_cols : optional
        Unused. It now defaults to None so the function can be handed directly
        to ``groupby(...).apply`` / ``progress_apply``, which pass only the group.
    """
    for col in grp.columns:
        # value_counts() ignores NaN and sorts by descending frequency.
        counts = grp[col].value_counts()
        fill_value = counts.index[0] if len(counts) > 0 else 0
        grp[col] = grp[col].fillna(fill_value)

    return grp

def isNan(val):
    """
    Return the result of comparing `val` with itself for inequality.

    For a float NaN this is True, since NaN does not compare equal to itself;
    for ordinary values such as strings or numbers it is False.
    """
    return val != val

def impute_values(df, grp_col_name = None):
    """
    Merge the shot-type columns, fill gaps in the shot-location columns and
    mode-impute all remaining NaNs per group.

    Steps
    -----
    1. ``type_of_shot`` and ``type_of_combined_shot`` are collapsed into
       ``shot_type`` (string) and ``shot_type_category``:
       2 = value taken from ``type_of_shot``, 1 = taken from
       ``type_of_combined_shot``, 0 = both missing (``shot_type`` is 'None').
    2. ``shot_basics``, ``range_of_shot`` and ``area_of_shot`` are replaced by
       ``sb``, ``ros`` and ``aos``. A missing value is filled with a random draw
       (module ``random``, not seeded here) from the values of that column seen
       on rows where a companion column was also present. If both companions
       are missing, or no such value was ever seen, the string "None" is used.
    3. The frame is grouped and every group goes through ``custom_impute_fun``.

    Parameters
    ----------
    df : DataFrame
        Must contain the five columns named above and the grouping column.
        The caller's frame is not modified; a new frame is returned.
    grp_col_name : str, optional
        Column to group by in step 3. Defaults to 'match_id'.
    """
    location_cols = ('shot_basics', 'range_of_shot', 'area_of_shot')

    # For a missing target column: which companion columns to consult, in order.
    fallback_order = {
        'shot_basics': ('range_of_shot', 'area_of_shot'),
        'range_of_shot': ('shot_basics', 'area_of_shot'),
        'area_of_shot': ('range_of_shot', 'shot_basics'),
    }

    def _draw(pool):
        # random.choice raises IndexError on an empty sequence; fall back to the
        # same "None" placeholder used when no companion column is available.
        return random.choice(pool) if pool else "None"

    shot_type_str = []
    shot_type_cat = []

    # pools[target][companion]: values of `target` on rows where `companion` is present too.
    pools = {target: collections.defaultdict(list) for target in location_cols}

    for _, row in tqdm(df.iterrows(), total = len(df)):
        # merging type of shot and combined type into one column and adding a column specifying type
        if not isNan(row['type_of_shot']):
            shot_type_str.append(str(row['type_of_shot']))
            shot_type_cat.append(2)
        elif not isNan(row['type_of_combined_shot']):
            shot_type_str.append(str(row['type_of_combined_shot']))
            shot_type_cat.append(1)
        else:
            shot_type_str.append('None')
            shot_type_cat.append(0)

        present = {name: not isNan(row[name]) for name in location_cols}
        for target in location_cols:
            if not present[target]:
                continue
            for companion in location_cols:
                if companion != target and present[companion]:
                    pools[target][companion].append(row[target])

    filled = {target: [] for target in location_cols}

    for _, row in tqdm(df.iterrows(), total = len(df)):
        present = {name: not isNan(row[name]) for name in location_cols}
        # Targets are visited in the fixed order of `location_cols`, so the
        # sequence of random draws is the same from run to run for a given seed.
        for target in location_cols:
            if present[target]:
                filled[target].append(row[target])
                continue
            for companion in fallback_order[target]:
                if present[companion]:
                    filled[target].append(_draw(pools[target][companion]))
                    break
            else:
                filled[target].append("None")

    df = df.assign(shot_type=shot_type_str, shot_type_category=shot_type_cat)
    df = df.drop(columns=['type_of_shot', 'type_of_combined_shot'])

    df = df.assign(sb=filled['shot_basics'],
                   ros=filled['range_of_shot'],
                   aos=filled['area_of_shot'])
    df = df.drop(columns=['range_of_shot', 'shot_basics', 'area_of_shot'])

    group_key = 'match_id' if grp_col_name is None else grp_col_name
    # group_keys=False keeps the original row index and order on every pandas
    # version. The lambda supplies custom_impute_fun's second argument explicitly,
    # because apply only passes the group itself.
    return (df.groupby(group_key, group_keys=False)
              .progress_apply(lambda grp: custom_impute_fun(grp, None)))

def split_dates(df):
    """
    Replace every column whose name contains 'date' by its date components.

    Each such column is parsed with ``pd.to_datetime``; its year, month and day
    are written to ``Year``, ``Month`` and ``Day`` and the source column is
    dropped. `df` is modified in place and also returned. With several matching
    columns the later ones overwrite ``Year``/``Month``/``Day``.
    """
    for name in df.columns:
        if name.find('date') == -1:
            continue
        df[name] = pd.to_datetime(df[name])
        stamps = df[name].dt
        df['Year'] = stamps.year
        df['Month'] = stamps.month
        df['Day'] = stamps.day
        df.drop(name, axis = 1, inplace =True)

    return df

def preprocess_data(df, drop_cols = ['team_name', 'team_id', 'shot_id_number']):
    """
    Run the full preprocessing pipeline and return a label-encoded frame.

    Steps: impute missing values (``impute_values``), drop `drop_cols`, split
    date columns (``split_dates``), add engineered features (``feature_engg``),
    cast object columns to ``str`` and label-encode every column.

    Parameters
    ----------
    df : DataFrame
        Raw or already-imputed shot data. It is copied first, so the caller's
        frame keeps all of its columns.
    drop_cols : list of str
        Columns removed before encoding. The default list is never mutated.

    Returns
    -------
    DataFrame in which every column has been passed through
    ``LabelEncoder().fit_transform``.

    NOTE(review): the encoder is re-fitted on each column of each call, so codes
    produced for two different frames (e.g. train and test) are not guaranteed
    to correspond to the same original values.
    """
    # The helpers below modify their input in place; work on a private copy.
    df = df.copy()

    # impute values
    print('Imputing Values')
    # impute_values consumes (and removes) `type_of_shot`, so only run it on
    # frames that have not been imputed yet.
    if 'type_of_shot' in df.columns:
        df = impute_values(df)

    # drop useless columns
    print('Dropping columns')
    remove_cols(drop_cols, df)

    # Split date columns into year, month and day
    df = split_dates(df)

    # Random FE
    df = feature_engg(df)

    # convert object columns to plain strings for LabelEncoder
    print('Converting types')
    for col in df.columns:
        if df[col].dtype.kind == 'O':
            print('Columns to str : ', col)
            # `.astype(str)` yields string values; `.str` alone is only the accessor object.
            df[col] = df[col].astype(str)

    # LabelEncode
    print('Encoding Values..')
    df_encoded = df.progress_apply(LabelEncoder().fit_transform)

    return df_encoded

In [None]:
# Preview the first 50 rows of the training split.
data_train.head(n = 50)

In [None]:
# (rows with no `type_of_shot`, rows where both shot-type columns are filled)
has_shot_type = ~data_train.type_of_shot.isnull()
has_combined_type = ~data_train.type_of_combined_shot.isnull()
data_train.type_of_shot.isnull().sum(), len(data_train[has_shot_type & has_combined_type])

In [None]:
# Number of distinct values per column. `unique()` exists only on Series, so
# calling it on the DataFrame raised AttributeError.
data_train.nunique()

In [None]:
# Column names to be treated as categorical features (used for CatBoost's
# `cat_features` further down).
category_cols = [
    'power_of_shot',
    'knockout_match',
    'game_season',
    'home/away',
    'Year',
    'Month',
    'Day',
    'shot_basics',
    'lat',
    'lng',
    'area_of_shot',
    'type_of_shot',
    'type_of_combined_shot',
    'power_of_shot.1',
    'knockout_match.1',
]

In [None]:
# Replace the training split by its imputed version. impute_values reads the
# `type_of_shot` column and removes it, so this cell cannot be re-run on its own output.
data_train = impute_values(data_train)

In [None]:
# Run the preprocessing pipeline on the training split and preview the encoded result.
data_train_en = preprocess_data(data_train)
data_train_en.head()

In [None]:
# First rows of the encoded training frame.
data_train_en.head()

In [None]:
# Features are every encoded column except the target; the target is `is_goal`.
feature_mask = data_train_en.columns != 'is_goal'
X_train = data_train_en.loc[:, feature_mask]
y_train = data_train_en['is_goal']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)

In [None]:
# The test split has no labels, so remove the empty target column before encoding.
# Rebinding (instead of an in-place drop on a selection of `total_data`) avoids
# pandas' SettingWithCopyWarning, and errors='ignore' lets the cell be re-run.
data_test = data_test.drop(columns=['is_goal'], errors='ignore')
text_x = preprocess_data(data_test)

In [None]:
# NOTE(review): `END` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError - presumably a deliberate stop for "Run All"; confirm.
END

## Modelling

In [None]:
# Gradient-boosted trees with library defaults, fitted on the full training set.
gbm = xgb.XGBClassifier()
gbm.fit(X_train, y_train)

In [None]:
# Logistic regression with inverse regularisation strength C = 0.1.
lr = LogisticRegression(C = 0.1)
lr.fit(X_train, y_train)

In [None]:
# Random forest of 300 trees, each limited to depth 3.
rf = RandomForestClassifier(n_estimators=300, max_depth=3)
rf.fit(X_train, y_train)

In [None]:
# Soft-voting ensemble of the three models above; XGBoost's probabilities get a
# slightly higher weight (1.33) than the other two (1 each).
ensemble_members = [('xgb', gbm), ('lt', lr), ('rf', rf)]
voting_cfl = VotingClassifier(estimators=ensemble_members,
                              voting='soft',
                              weights=[1.33, 1, 1])
vcfl = voting_cfl.fit(X_train, y_train)

In [None]:
# Second column of the logistic regression's class probabilities for the test features.
predictions = lr.predict_proba(text_x)[:,1]#.max(axis = 1)

In [None]:
# List (importance, feature name) pairs from the fitted XGBoost model, highest first.
print("Features sorted by their score:")
feature_names = data_train_en.loc[:, data_train_en.columns != 'is_goal'].columns
rounded_scores = map(lambda x: round(x, 4), gbm.feature_importances_)
ranked = sorted(zip(rounded_scores, feature_names), reverse=True)
for val in ranked:
    print(val)

In [None]:
# Largest predicted value.
predictions.max()

In [None]:
# Display the current predictions.
predictions

In [None]:
# NOTE(review): `END` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError - presumably a deliberate stop for "Run All"; confirm.
END

In [None]:
# Exhaustive grid search over XGBoost hyper-parameters.
# The grid has 5 * 6 * 5 * 4 = 600 combinations, each fitted on 4 folds.
gbm = xgb.XGBClassifier()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.01, 0.02, 0.03,0.1, 1], #so called `eta` value
              'max_depth': [3,4,5,6,7,8],
              'min_child_weight': [2,3,4,5,7],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.5],
              'n_estimators': [300, 400,500, 600], #number of trees
              'seed': [1337],
              'missing': [9999999999]}

# The model seed is pinned above, but shuffled folds without a seed would still
# change on every run; fix the split seed too so the search is reproducible.
cv_folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=1337)

clf = GridSearchCV(gbm, parameters, n_jobs=2,
                   cv=cv_folds,
                   verbose=2, refit=True,scoring='neg_mean_absolute_error')

clf.fit(X_train, y_train)



In [None]:
# Parameter combination selected by the search.
clf.best_params_

In [None]:
# Best cross-validated score and the parameters that produced it.
best_parameters = clf.best_params_
score = clf.best_score_
print('neg_mean_absolute_error:', score)
for param_name in sorted(best_parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Mean absolute error of the predicted probabilities on the training data itself,
# followed by 1 / (1 + error).
train_goal_proba = clf.predict_proba(X_train)[:,1]
mea = mean_absolute_error(y_train, train_goal_proba)
print('Overall mean_absolute_error:', mea, 1/(1 + mea))

In [None]:
# Full report of the grid search stored in `clf`.
# The original cell read `random_search`, `folds` and `param_comb`, none of which
# is defined in this notebook. The fold and combination counts are now taken from
# the fitted search object. The score is printed as-is, since the "normalized
# gini" transform (2 * score - 1) only applies to an AUC scorer.
print('\n All results:')
print(clf.cv_results_)
print('\n Best estimator:')
print(clf.best_estimator_)
print('\n Best neg_mean_absolute_error for %d-fold search with %d parameter combinations:'
      % (clf.n_splits_, len(clf.cv_results_['params'])))
print(clf.best_score_)
print('\n Best hyperparameters:')
print(clf.best_params_)

In [None]:
# Second column of the class probabilities from the current `clf` for the test features.
predictions = clf.predict_proba(text_x)[:,1]#.max(axis = 1)#

In [None]:
# Range and mean of the predicted values.
predictions.max(), predictions.min(), predictions.mean()

In [None]:
# mean_absolute_error(y_train, predictions), 1 / (1 + mean_absolute_error(y_train, predictions))

In [None]:
# Keep the current `predictions` under the name used when building the submission.
pred_final = predictions#.max(axis = 1)

In [None]:
# Display the current predictions.
predictions

In [None]:
# Display the values that will be submitted.
pred_final

In [None]:
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
    """
    Return the mean K-fold accuracy of a CatBoost classifier built from `params`.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``cb.CatBoostClassifier``.
    train_set : DataFrame
        Feature rows.
    train_label : Series or DataFrame
        Labels, in the same row order as `train_set`.
    cat_dims : list or None
        Passed to ``fit`` as ``cat_features``.
    n_splits : int
        Number of shuffled folds (the shuffle is not seeded).
    """
    # KFold is not imported at the top of the notebook.
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=n_splits,shuffle=True)
    res = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index,:]
        test = train_set.iloc[test_index,:]

        # kf.split yields positional indices, so the labels must be selected by
        # position as well. `.ix` was removed in pandas 1.0 and was label-based
        # for integer indexes, which could misalign labels and features.
        labels = train_label.iloc[train_index]
        test_labels = train_label.iloc[test_index]

        model = cb.CatBoostClassifier(**params)
        model.fit(train, np.ravel(labels), cat_features=cat_dims)

        # Flatten both sides so the comparison is element-wise even if predict
        # returns a column vector.
        fold_hits = np.ravel(model.predict(test)) == np.ravel(test_labels)
        res.append(np.mean(fold_hits))
    return np.mean(res)

In [None]:
def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
    """
    Search CatBoost hyper-parameters stage by stage and return the best set.

    'border_count', 'l2_leaf_reg' and 'depth' are searched individually, while
    'iterations' and 'learning_rate' are searched together. Every candidate is
    scored with ``crossvaltest`` and registered with the search helper so that
    later stages can reuse the best parameters found so far.

    NOTE(review): `paramsearch` is neither defined nor imported in the visible
    notebook; it has to be provided before this function can run.
    """
    # `chain` is not imported at the top of the notebook.
    from itertools import chain

    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually
    #   but 'iterations','learning_rate' together
    search_stages = chain(ps.grid_search(['border_count']),
                          ps.grid_search(['l2_leaf_reg']),
                          ps.grid_search(['iterations','learning_rate']),
                          ps.grid_search(['depth']))
    for prms in search_stages:
        res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

In [None]:
# CatBoost with default settings; categorical features are passed by column position.
clf = cb.CatBoostClassifier()#, loss_function='CrossEntropy')
# `category_cols` lists names that preprocessing removes or replaces; looking those
# up with get_loc raises KeyError, so only names present in X_train are used.
cat_dims = [X_train.columns.get_loc(name) for name in category_cols if name in X_train.columns]
clf.fit(X_train, np.ravel(y_train), cat_features=cat_dims)

In [None]:
# Class probabilities from the current `clf` for the test features. Unlike the
# earlier prediction cells, the result is not sliced to a single column here.
predictions = clf.predict_proba(text_x)

In [None]:
# Display the current predictions.
predictions

In [None]:
# sample_sub_df = pd.read_csv('data/sample_submission.csv')
# sample_sub_df.head()

In [None]:
# final_df['shot_id_number'] = pd.read_csv('data/sample_submission.csv')['shot_id_number']
# Submission frame: the test shots' ids next to the predicted `is_goal` values.
final_df = data_test[['shot_id_number']].copy()
final_df['is_goal'] = pred_final

In [None]:
# Write the submission without the index column.
final_df.to_csv('submission14.csv', index =False)

In [None]:
# Load two earlier submissions for comparison. `prev_sub` is used by the
# comparison cells below, so its load must not stay commented out.
prev_sub = pd.read_csv('submission2.csv')
cur_sub = pd.read_csv('submission3_0.932.csv')

In [None]:
# Summary statistics of the submitted `is_goal` values.
cur_sub['is_goal'].describe()

In [None]:
# Dtypes of the `is_goal` column in the two submissions.
prev_sub['is_goal'].dtype, cur_sub['is_goal'].dtype

In [None]:
# Fraction of rows on which the two submissions have exactly the same `is_goal` value.
(prev_sub['is_goal'] == cur_sub['is_goal']).sum() / len(prev_sub)