In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('500hits.csv' , encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [4]:
# remove unwanted columns
df = df.drop(columns = ['PLAYER', 'CS'])

In [5]:
df.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1


In [7]:
# ':' selects all rows of this data frame
# '0:13' selects column 0 to column 12
# X represents the features or input variables of the dataset
X = df.iloc[:, 0:13]

In [8]:
# y represents the target variable or output variable of the dataset
y = df.iloc[:, 13]

In [9]:
# import ML library scikit-learn
from sklearn.model_selection import train_test_split

In [10]:
# 'random_state' sets the random seed for reproducibility
# 'test_size = 0.2' means 20% of the data will be used for testing (80% for training)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, test_size=0.2)

In [13]:
# get info of (n_rows, n_columns) from X_train
X_train.shape

(372, 13)

In [12]:
X_test.shape

(93, 13)

In [14]:
y_train.shape

(372,)

In [15]:
y_test.shape

(93,)

In [16]:
# import library for decision tree algorithm
from sklearn.tree import DecisionTreeClassifier

In [17]:
dtc = DecisionTreeClassifier()

In [19]:
# explote the parameters
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [21]:
# 'fit()' is a mathod used to train (fit) a machine learning model to the provided training data
dtc.fit(X_train, y_train)

In [23]:
# generate prediction with X_test as input
y_pred = dtc.predict(X_test)

In [24]:
# The confusion matrix is a table that is used to describe the performance of a classification model with test data set
from sklearn.metrics import confusion_matrix

In [26]:
# in the matrix: [[true positive, false positive], [false negative, true negative]]
print(confusion_matrix(y_test, y_pred))

[[52  9]
 [10 22]]


In [27]:
# import library for classification report
from sklearn.metrics import classification_report

In [29]:
# Precision: The ratio of correctly predicted positive observations to the total predicted positives.
# Recall: The ratio of correctly predicted positive observations to the all observations in actual class.
# F1-score: The weighted average of Precision and Recall. It considers both false positives and false negatives. 
# F1 = (2*precision*recall/ (recall+precision)
# Support: The number of actual occurrences of the class in the specified dataset.

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85        61
           1       0.71      0.69      0.70        32

    accuracy                           0.80        93
   macro avg       0.77      0.77      0.77        93
weighted avg       0.79      0.80      0.79        93



In [30]:
# return an array where each element represents the importance of a feature
# higher the value, the more important the feature is for predicting the target variable
dtc.feature_importances_

array([0.0147551 , 0.03815083, 0.02492137, 0.06838822, 0.38738161,
       0.05142854, 0.04400595, 0.        , 0.10560465, 0.05944383,
       0.03117801, 0.07835735, 0.09638455])

In [31]:
X.columns

Index(['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB',
       'BA'],
      dtype='object')

In [35]:
# 'dtc.feature_importances_' accesses the array of feature importances from the trained DecisionTreeClassifier
# 'index' specifies the index (row labels) of the DataFrame
features = pd.DataFrame(dtc.feature_importances_, index = X.columns, columns=['Importance'])

In [36]:
# test
features.head(15)

Unnamed: 0,Importance
YRS,0.014755
G,0.038151
AB,0.024921
R,0.068388
H,0.387382
2B,0.051429
3B,0.044006
HR,0.0
RBI,0.105605
BB,0.059444


In [37]:
# create a new decision tree classifier instance with specific parameters
# 'entropy' refers to the information gain criterion, 
# which measures the amount of information gained about the target variable from each split
# by default, criterion is gini impurity
# 'ccp_alpha=0.04' is used for cost complexity pruning and specifies the complexity parameter used for Minimal Cost-Complexity Pruning.

dtc2 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha=0.04)

In [38]:
dtc2.fit(X_train, y_train)

In [40]:
y_pred2 = dtc2.predict(X_test)

In [42]:
print(confusion_matrix(y_test, y_pred2))

[[50 11]
 [ 9 23]]


In [43]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83        61
           1       0.68      0.72      0.70        32

    accuracy                           0.78        93
   macro avg       0.76      0.77      0.77        93
weighted avg       0.79      0.78      0.79        93



In [46]:
features2 = pd.DataFrame(dtc2.feature_importances_, index=X.columns, columns=['Importance'])

In [47]:
features2.head(15)

Unnamed: 0,Importance
YRS,0.0
G,0.0
AB,0.0
R,0.0
H,0.837977
2B,0.0
3B,0.0
HR,0.0
RBI,0.0
BB,0.0
