In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder



In [3]:
o_data  = pd.read_csv('data/data.csv', header=0)
test_data = o_data[o_data['shot_made_flag'].isnull()]

test_data.info()
train_data = o_data[~o_data['shot_made_flag'].isnull()]
train_data.info()
full_data = [train_data, test_data]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 30693
Data columns (total 25 columns):
action_type           5000 non-null object
combined_shot_type    5000 non-null object
game_event_id         5000 non-null int64
game_id               5000 non-null int64
lat                   5000 non-null float64
loc_x                 5000 non-null int64
loc_y                 5000 non-null int64
lon                   5000 non-null float64
minutes_remaining     5000 non-null int64
period                5000 non-null int64
playoffs              5000 non-null int64
season                5000 non-null object
seconds_remaining     5000 non-null int64
shot_distance         5000 non-null int64
shot_made_flag        0 non-null float64
shot_type             5000 non-null object
shot_zone_area        5000 non-null object
shot_zone_basic       5000 non-null object
shot_zone_range       5000 non-null object
team_id               5000 non-null int64
team_name             5000 non-null object

In [4]:
shot_ids = full_data[1]['shot_id']
shot_ids.head()

0      1
7      8
16    17
19    20
32    33
Name: shot_id, dtype: int64

## Feature Engineering

### 投篮类型

In [5]:
train_data[['shot_type', 'shot_made_flag']].groupby('shot_type', as_index=False).mean()

Unnamed: 0,shot_type,shot_made_flag
0,2PT Field Goal,0.477348
1,3PT Field Goal,0.329268


### 投篮距离, 影响较大的因素

In [6]:
train_data[['shot_distance', 'shot_made_flag']].groupby('shot_distance', as_index=False).mean()

Unnamed: 0,shot_distance,shot_made_flag
0,0,0.634766
1,1,0.660920
2,2,0.532091
3,3,0.435737
4,4,0.473186
5,5,0.434685
6,6,0.427289
7,7,0.421227
8,8,0.477113
9,9,0.431818


### 投篮类型.粒度太细,某些类型的投篮次数只有个位数

In [7]:
# action_type_group = train_data[['action_type', 'shot_made_flag']].groupby('action_type', as_index=False)
# action_type_group.describe()

In [8]:
g = train_data[['combined_shot_type', 'shot_made_flag']].groupby('combined_shot_type', as_index=False)
print(g.mean())
g.count()

  combined_shot_type  shot_made_flag
0          Bank Shot        0.791667
1               Dunk        0.928030
2          Hook Shot        0.535433
3          Jump Shot        0.391071
4              Layup        0.565093
5           Tip Shot        0.348684


Unnamed: 0,combined_shot_type,shot_made_flag
0,Bank Shot,120
1,Dunk,1056
2,Hook Shot,127
3,Jump Shot,19710
4,Layup,4532
5,Tip Shot,152


In [9]:
train_data[['shot_zone_area', 'shot_made_flag']].groupby('shot_zone_area', as_index=False).mean()


Unnamed: 0,shot_zone_area,shot_made_flag
0,Back Court(BC),0.013889
1,Center(C),0.525556
2,Left Side Center(LC),0.361177
3,Left Side(L),0.396871
4,Right Side Center(RC),0.382567
5,Right Side(R),0.401658


In [10]:
train_data[['shot_zone_basic', 'shot_made_flag']].groupby('shot_zone_basic', as_index=False).mean()


Unnamed: 0,shot_zone_basic,shot_made_flag
0,Above the Break 3,0.329237
1,Backcourt,0.016667
2,In The Paint (Non-RA),0.454381
3,Left Corner 3,0.370833
4,Mid-Range,0.406286
5,Restricted Area,0.618004
6,Right Corner 3,0.339339


In [11]:
train_data[['opponent', 'shot_made_flag']].groupby('opponent', as_index=False).mean()

Unnamed: 0,opponent,shot_made_flag
0,ATL,0.452055
1,BKN,0.4
2,BOS,0.411239
3,CHA,0.436
4,CHI,0.430233
5,CLE,0.439689
6,DAL,0.454017
7,DEN,0.45784
8,DET,0.441227
9,GSW,0.464567


除了后面几个赛季,其他赛季命中率相差都不大

In [12]:
train_data[['season', 'shot_made_flag']].groupby('season', as_index=False).mean()


Unnamed: 0,season,shot_made_flag
0,1996-97,0.422977
1,1997-98,0.430864
2,1998-99,0.458824
3,1999-00,0.460366
4,2000-01,0.466667
5,2001-02,0.458431
6,2002-03,0.436285
7,2003-04,0.43326
8,2004-05,0.436557
9,2005-06,0.453742


In [13]:
def get_host_or_guest(s):
    return 1 if '@' in s else 0
for dataset in full_data:
    dataset['is_guest'] = dataset['matchup'].apply(get_host_or_guest)
    dataset['shot_type'] = dataset['shot_type'].map({'2PT Field Goal':0, '3PT Field Goal': 1})
#     ohe = OneHotEncoder(categorical_features=[2, 3, 5, 6, 7])
#     dataset = ohe.fit_transform(dataset).toarray()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### 主客场

In [14]:
train_data[['is_guest', 'shot_made_flag']].groupby('is_guest', as_index=False).mean()

Unnamed: 0,is_guest,shot_made_flag
0,0,0.456468
1,1,0.436421


In [44]:
    
selected_f = ['shot_type', 'shot_distance','season', 'opponent', 'is_guest','shot_zone_area', 'shot_zone_basic', 'combined_shot_type']
y = train_data['shot_made_flag']
train_data = train_data[selected_f]
test_data = test_data[selected_f]

In [175]:
class myNb:
    def __init__(self):
        self.cProb = {}
        self.cCondProb = {}
    def fit(self, X_train, y_train):
        import numpy as np

        # 生还与否的概率
        t = len(y_train)
        # 生还和死者总数
        cNumber = {}
        cValues = np.unique(y_train)
        #类概率
        for cv in cValues:
            cNumber[cv] = (y_train == cv).sum()
            self.cProb[cv] = cNumber[cv] / t
        #求类条件概率
        for col in X_train:
            if col not in self.cCondProb:
                self.cCondProb[col] = {}
            for cv in self.cProb:
                if cv not in self.cCondProb[col]:
                    self.cCondProb[col][cv] = {}
                for fv in np.unique(X_train[col]):
                    self.cCondProb[col][cv][fv] = ((X_train[col] == fv) & (y_train == cv)).sum() / cNumber[cv]
                    if self.cCondProb[col][cv][fv] == 0:
                        self.cCondProb[col][cv][fv] = 1e-10

    def predict_proba(self, X_test):
        y_pred = []
        P = [0] * len(self.cProb)
        Plog = [0] * len(self.cProb)
        for cv in range(0, len(self.cProb)):
            Plog[cv] = np.log(self.cProb[cv])
        for i in X_test.index:
            for cv in range(0, len(self.cProb)):
                P[cv] = Plog[cv]
                for col in X_test:
#                     print(i, col)
                    if self.cCondProb[col][cv][X_test.loc[i][col]] == 0:
                        print(i, col)
                    P[cv] += np.log(self.cCondProb[col][cv][X_test.loc[i][col]])
            P = np.exp(P)
            P = P / sum(P)
#             print(i,P)
#             y_pred.append(max(P, key=P.get))
#             np.append(y_pred, [P])
            y_pred.append(P.tolist())
        return np.array(y_pred)

    # def predict_proba(self, X_test):

In [73]:
train_data.index
# test_data.index

Int64Index([    1,     2,     3,     4,     5,     6,     8,     9,    10,
               11,
            ...
            30685, 30687, 30688, 30689, 30690, 30691, 30692, 30694, 30695,
            30696],
           dtype='int64', length=25697)

In [176]:
nb = myNb()
nb.fit(train_data, y)

In [156]:
print(nb.cProb, nb.cCondProb)

{0.0: 0.55383896952951706, 1.0: 0.44616103047048294} {'season': {0.0: {'2005-06': 0.073847667228780209, '1996-97': 0.015528386734120292, '1999-00': 0.049747048903878585, '2001-02': 0.064994378864530633, '2014-15': 0.025997751545812253, '2009-10': 0.068015739179314222, '2007-08': 0.067945474985947163, '2013-14': 0.0024592467678471049, '2004-05': 0.044617762788083194, '2010-11': 0.059162450815064646, '2003-04': 0.054595278246205733, '2012-13': 0.050590219224283306, '2008-09': 0.069210230466554237, '2011-12': 0.057124789207419896, '2015-16': 0.042158516020236091, '2000-01': 0.059021922428330521, '1997-98': 0.032391793142214728, '2002-03': 0.073355817875210796, '2006-07': 0.060146149522203485, '1998-99': 0.0290893760539629}, 1.0: {'2005-06': 0.076144788486698645, '1996-97': 0.014129960750109027, '1999-00': 0.052682075883122549, '2001-02': 0.068294810292193628, '2014-15': 0.01945050152638465, '2009-10': 0.070126471870911464, '2007-08': 0.074313126907980809, '2013-14': 0.0020933275185346708,

In [207]:
start = 10000
offset = 1000
y_pred = nb.predict_proba(train_data.iloc[start:start+offset])
# print(y_pred)
print('log loss: %.2f' % log_loss(y[start:start+offset].values, y_pred))
y_pred_class = (y_pred.T[0]<y_pred.T[1]) * 1
print('Accuracy: %.2f' % accuracy_score(y[start:start+offset].values, y_pred_class))

log loss: 0.76
Accuracy: 0.63


In [220]:
print(y_pred[0:10])
y_pred_alter = y_pred - (y_pred-0.5) * 0.5
print(y_pred_alter[0:10])
print('log loss: %.2f' % log_loss(y[start:start+offset].values, y_pred_alter))


[[ 0.71323094  0.28676906]
 [ 0.01151631  0.98848369]
 [ 0.01151631  0.98848369]
 [ 0.7122238   0.2877762 ]
 [ 0.7197713   0.2802287 ]
 [ 0.68306044  0.31693956]
 [ 0.88610881  0.11389119]
 [ 0.71323094  0.28676906]
 [ 0.70073459  0.29926541]
 [ 0.59996924  0.40003076]]
[[ 0.60661547  0.39338453]
 [ 0.25575815  0.74424185]
 [ 0.25575815  0.74424185]
 [ 0.6061119   0.3938881 ]
 [ 0.60988565  0.39011435]
 [ 0.59153022  0.40846978]
 [ 0.6930544   0.3069456 ]
 [ 0.60661547  0.39338453]
 [ 0.60036729  0.39963271]
 [ 0.54998462  0.45001538]]
log loss: 0.66


In [193]:
# print(y_pred[0:10])
# print(1-y_pred[0:10])
(y_pred.T[0]<y_pred.T[1]) * 1
# y[start:start+offset].values[0:10]

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0,

In [15]:
# class_le = LabelEncoder()
    
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)


X=train_data.values
y=y.values

In [16]:
print(train_data.columns)


Index(['shot_type', 'shot_distance', 'is_guest', 'season_1996-97',
       'season_1997-98', 'season_1998-99', 'season_1999-00', 'season_2000-01',
       'season_2001-02', 'season_2002-03', 'season_2003-04', 'season_2004-05',
       'season_2005-06', 'season_2006-07', 'season_2007-08', 'season_2008-09',
       'season_2009-10', 'season_2010-11', 'season_2011-12', 'season_2012-13',
       'season_2013-14', 'season_2014-15', 'season_2015-16', 'opponent_ATL',
       'opponent_BKN', 'opponent_BOS', 'opponent_CHA', 'opponent_CHI',
       'opponent_CLE', 'opponent_DAL', 'opponent_DEN', 'opponent_DET',
       'opponent_GSW', 'opponent_HOU', 'opponent_IND', 'opponent_LAC',
       'opponent_MEM', 'opponent_MIA', 'opponent_MIL', 'opponent_MIN',
       'opponent_NJN', 'opponent_NOH', 'opponent_NOP', 'opponent_NYK',
       'opponent_OKC', 'opponent_ORL', 'opponent_PHI', 'opponent_PHX',
       'opponent_POR', 'opponent_SAC', 'opponent_SAS', 'opponent_SEA',
       'opponent_TOR', 'opponent_UTA', 'opp

In [49]:
# rf = RandomForestClassifier()

import matplotlib.pyplot as plt
import seaborn as sns
classifiers = [
#     KNeighborsClassifier(3),
#     SVC(probability=True),
#     DecisionTreeClassifier(),
#     RandomForestClassifier(),
    AdaBoostClassifier(),
#     GradientBoostingClassifier(),
#     GaussianNB(),
    MultinomialNB(),
    LinearDiscriminantAnalysis(),
#     QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]
# classifiers = [rf]
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
acc_dict = {}
log_cols = ["Classifier", "Accuracy"]
log 	 = pd.DataFrame(columns=log_cols)


for train_index, test_index in sss.split(X, y):
	X_train, X_test = X[train_index], X[test_index]
	y_train, y_test = y[train_index], y[test_index]
	
	for clf in classifiers:
		name = clf.__class__.__name__
		clf.fit(X_train, y_train)
# 		train_predictions = clf.predict(X_test)
# 		acc = accuracy_score(y_test, train_predictions)
		train_predictions_proba = clf.predict_proba(X_test)
		acc = log_loss(y_test, train_predictions_proba)
		if name in acc_dict:
			acc_dict[name] += acc
		else:
			acc_dict[name] = acc
for clf in acc_dict:
	acc_dict[clf] = acc_dict[clf] / 10.0
	log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
	log = log.append(log_entry)
    
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")




<matplotlib.axes._subplots.AxesSubplot at 0x7eff511695c0>

In [50]:
log

Unnamed: 0,Classifier,Accuracy
0,LogisticRegression,0.65092
0,MultinomialNB,0.813672
0,AdaBoostClassifier,0.690522
0,LinearDiscriminantAnalysis,0.651967


In [27]:
log

Unnamed: 0,Classifier,Accuracy
0,LogisticRegression,0.613385
0,DecisionTreeClassifier,0.541712
0,AdaBoostClassifier,0.614864
0,LinearDiscriminantAnalysis,0.614086
0,RandomForestClassifier,0.555525


In [42]:
import time
candidate_classifier = LogisticRegression()
candidate_classifier.fit(X, y)
result = candidate_classifier.predict_proba(test_data.values)

submission = pd.DataFrame({'shot_id': shot_ids, 'shot_made_flag':result.T[1]})
submission.to_csv('LR_' + str(time.time()) +'.csv', index=False)

In [43]:
submission.to_csv('LR_' + str(time.time()) +'.csv', index=False)