### Importing the Labraries

In [71]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
(LinearDiscriminantAnalysis as LDA,
QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [72]:
# Loading the dataset
Smarket = pd.read_csv('Smarket.csv')
Smarket


AttributeError: 'Index' object has no attribute '_format_flat'

      Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
0     2001  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
1     2001  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
2     2001  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
3     2001 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
4     2001  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
...    ...    ...    ...    ...    ...    ...      ...    ...       ...
1245  2005  0.422  0.252 -0.024 -0.584 -0.285  1.88850  0.043        Up
1246  2005  0.043  0.422  0.252 -0.024 -0.584  1.28581 -0.955      Down
1247  2005 -0.955  0.043  0.422  0.252 -0.024  1.54047  0.130        Up
1248  2005  0.130 -0.955  0.043  0.422  0.252  1.42236 -0.298      Down
1249  2005 -0.298  0.130 -0.955  0.043  0.422  1.38254 -0.489      Down

[1250 rows x 9 columns]

In [73]:
Smarket.columns

Index(['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today',
       'Direction'],
      dtype='object')

In [74]:
# Smarket.corr()

In [75]:
# Smarket.plot(y='Volume');

###  Logistic Regression

In [76]:
# Building a logistic regression model using all predictors from the Smarket dataset (excluding 'Today', 'Direction', and 'Year') to predict whether the stock market goes "Up" or "Down."
allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
design = MS(allvars)
X = design.fit_transform(Smarket)
y = Smarket.Direction == 'Up'
glm = sm.GLM(y,
X,
family=sm.families.Binomial())
results = glm.fit()
summarize(results)

AttributeError: 'Index' object has no attribute '_format_flat'

             coef  std err      z  P>|z|
intercept -0.1260    0.241 -0.523  0.601
Lag1      -0.0731    0.050 -1.457  0.145
Lag2      -0.0423    0.050 -0.845  0.398
Lag3       0.0111    0.050  0.222  0.824
Lag4       0.0094    0.050  0.187  0.851
Lag5       0.0103    0.050  0.208  0.835
Volume     0.1354    0.158  0.855  0.392

In [77]:
# retrieveing the estimated coefficients (parameters) of the logistic regression model for each predictor in the dataset, showing the effect of each variable on the probability of the outcome.
results.params

intercept   -0.126000
Lag1        -0.073074
Lag2        -0.042301
Lag3         0.011085
Lag4         0.009359
Lag5         0.010313
Volume       0.135441
dtype: float64

In [78]:
# retrieveing the p-values for each coefficient in the logistic regression model, indicating the statistical significance of each predictor in determining the outcome.
results.pvalues

intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64

In [79]:
# Calculating the predicted probabilities of the outcome (the probability of the stock market going "Up") for all observations in the dataset based on the logistic regression model.
probs = results.predict()
probs[:10]

array([0.50708413, 0.48146788, 0.48113883, 0.51522236, 0.51078116,
       0.50695646, 0.49265087, 0.50922916, 0.51761353, 0.48883778])

In [80]:
# Creates an array called labels with 1,250 elements, all initially set to "Down". It then updates the elements where the predicted probability (probs) is greater than 0.5 to "Up"
labels = np.array(['Down']*1250)
labels[probs >0.5] = "Up"

### Classifcation

In [81]:
# Creating a confusion matrix to compare the predicted labels (labels) with the actual stock market direction (Smarket.Direction).
confusion_table(labels, Smarket.Direction)


AttributeError: 'Index' object has no attribute '_format_flat'

Truth      Down   Up
Predicted           
Down        145  141
Up          457  507

In [82]:
# Calculating the accuracy of the model's predictions by summing the number of correct predictions (True Positives and True Negatives) and dividing by the total number of observations (1,250).
(507+145)/1250, np.mean(labels == Smarket.Direction)

(0.5216, 0.5216)

In [83]:
# Splitting the Smarket dataset into training and testing sets based on the year, allowing to use data from 2004 and earlier for training and data from 2005 and later for testing.
train = (Smarket.Year < 2005)
Smarket_train = Smarket.loc[train]
Smarket_test = Smarket.loc[~train]
Smarket_test.shape

(252, 9)

In [84]:
# Splitting the dataset into training and testing sets for predictors (X) and target variable (y), fits a logistic regression model to the training data, and then calculates the predicted probabilities for the test data.
X_train , X_test = X.loc[train], X.loc[~train]
y_train , y_test = y.loc[train], y.loc[~train]
glm_train = sm.GLM(y_train ,
X_train ,
family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)

In [85]:
# Dividing the dataset into training and testing subsets for predictors (X) and response variable (y), fits a generalized linear model (logistic regression) to the training data, and then uses the model to predict probabilities for the test set.
X_train , X_test = X.loc[train], X.loc[~train]
y_train , y_test = y.loc[train], y.loc[~train]
glm_train = sm.GLM(y_train ,
X_train ,
family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)

In [86]:
# Extracting the Direction column from the Smarket DataFrame into a variable D and then splits it into training labels (L_train) and testing labels (L_test) based on the previously defined train boolean index, where L_train contains the labels for years before 2005 and L_test contains the labels for years 2005 and onward.
D = Smarket.Direction
L_train , L_test = D.loc[train], D.loc[~train]

In [87]:
# Creating an array labels initialized with 252 elements, all set to "Down", and then updates the entries to "Up" for the indices where the predicted probabilities (probs) exceed 0.5, indicating a predicted outcome of "Up" for those observations.
labels = np.array(['Down']*252)
labels[probs >0.5] = 'Up'
confusion_table(labels, L_test)

AttributeError: 'Index' object has no attribute '_format_flat'

Truth      Down  Up
Predicted          
Down         77  97
Up           34  44

In [88]:
# Calculating the model's accuracy as the proportion of correct predictions (np.mean(labels == L_test)) and the error rate as the proportion of incorrect predictions (np.mean(labels != L_test)).
np.mean(labels == L_test), np.mean(labels != L_test)


(0.4801587301587302, 0.5198412698412699)

### K-Nearest Neighbors

In [89]:
# Initializing a K-Nearest Neighbors classifier with one neighbor, fits the model to the training data (X_train and L_train), makes predictions on the test set (X_test), and then generates a confusion matrix to compare the predicted labels (knn1_pred) with the actual test labels (L_test).
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train , L_train)
knn1_pred = knn1.predict(X_test)
confusion_table(knn1_pred , L_test)

AttributeError: 'Index' object has no attribute '_format_flat'

Truth      Down  Up
Predicted          
Down         50  62
Up           61  79

In [90]:
# Calculating the K-Nearest Neighbors model's accuracy as the proportion of correct predictions, with the first expression computing accuracy using counts of True Positives and True Negatives, while the second uses a direct comparison of predicted labels against actual labels.
(83+43)/252, np.mean(knn1_pred == L_test)


(0.5, 0.5119047619047619)

In [91]:
# Initializing a K-Nearest Neighbors classifier with three neighbors, fits the model to the training data (X_train and L_train), makes predictions on the test set (X_test), and calculates the accuracy by finding the proportion of correct predictions (np.mean(knn3_pred == L_test)).
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3_pred = knn3.fit(X_train , L_train).predict(X_test)
np.mean(knn3_pred == L_test)

0.503968253968254

In [92]:
# loading the Caravan dataset, extracts the Purchase column, and counts the occurrences of each unique value in that column to show the distribution of purchase outcomes.
Caravan = load_data('Caravan')
Purchase = Caravan.Purchase
Purchase.value_counts()

Purchase
No     5474
Yes     348
Name: count, dtype: int64

In [93]:
# Calculating the proportion of purchases made in the dataset
348 / 5822

0.05977327378907592

In [94]:
# Creating a new DataFrame feature_df by dropping the Purchase column from the Caravan dataset, leaving only the features used for modeling.
feature_df = Caravan.drop(columns=['Purchase'])

In [95]:
# Initializing a StandardScaler to standardize features by centering them to mean zero and scaling to unit variance, while keeping the original data unchanged.
scaler = StandardScaler(with_mean=True,
with_std=True,
copy=True)


In [96]:
# Placing the StandardScaler to the feature_df (computing the mean and standard deviation for scaling), and then transforms feature_df by standardizing its features, storing the scaled data in X_std.
scaler.fit(feature_df)
X_std = scaler.transform(feature_df)


In [97]:
# Creating a new DataFrame feature_std using the standardized data (X_std) while preserving the original column names from feature_df. It then calculates and displays the standard deviation of each feature in feature_std, which should be approximately 1 for all features since the data has been standardized.
feature_std = pd.DataFrame(
X_std,
columns=feature_df.columns);
feature_std.std()


MOSTYPE     1.000086
MAANTHUI    1.000086
MGEMOMV     1.000086
MGEMLEEF    1.000086
MOSHOOFD    1.000086
              ...   
AZEILPL     1.000086
APLEZIER    1.000086
AFIETS      1.000086
AINBOED     1.000086
ABYSTAND    1.000086
Length: 85, dtype: float64

In [98]:
# Splitting the standardized features (feature_std) and the target variable (Purchase) into training and testing sets using an 80-20 split, with 1,000 observations allocated to the test set.
(X_train ,
X_test,
y_train ,
y_test) = train_test_split(feature_std ,
Purchase ,
test_size=1000,
random_state=0)

In [99]:
# Calculating the K-Nearest Neighbors model's misclassification rate (np.mean(y_test != knn1_pred)) and the baseline rate of positive instances in the test set (np.mean(y_test != "No")) for comparison.
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1_pred = knn1.fit(X_train , y_train).predict(X_test)
np.mean(y_test != knn1_pred), np.mean(y_test != "No")


(0.111, 0.067)

In [100]:
# Generating a confusion matrix comparing the predicted labels (knn1_pred) from the K-Nearest Neighbors model with the actual labels (y_test), summarizing the model’s performance by displaying counts of True Positives, True Negatives, False Positives, and False Negatives.
confusion_table(knn1_pred , y_test)

AttributeError: 'Index' object has no attribute '_format_flat'

Truth       No  Yes
Predicted          
No         880   58
Yes         53    9

In [101]:
# Calculating the proportion of true positives to the total positive predictions

9/(53+9)


0.14516129032258066

### Tuning Parameters

In [102]:
# Evaluating the performance of K-Nearest Neighbors classifiers with different 𝐾 values, providing insights into how the choice of 𝐾 affects prediction outcomes and accuracy.
for K in range(1,6):
  knn = KNeighborsClassifier(n_neighbors=K)
  knn_pred = knn.fit(X_train , y_train).predict(X_test)
  C = confusion_table(knn_pred , y_test)
  templ = ('K={0:d}: # predicted to rent: {1:>2},' +
          ' # who did rent {2:d}, accuracy {3:.1%}')
  pred = C.loc['Yes'].sum()
  did_rent = C.loc['Yes','Yes']
  print(templ.format(
        K,
        pred,
        did_rent ,
        did_rent / pred))

K=1: # predicted to rent: 62, # who did rent 9, accuracy 14.5%
K=2: # predicted to rent:  6, # who did rent 1, accuracy 16.7%
K=3: # predicted to rent: 20, # who did rent 3, accuracy 15.0%
K=4: # predicted to rent:  4, # who did rent 0, accuracy 0.0%
K=5: # predicted to rent:  7, # who did rent 1, accuracy 14.3%
