### Decision Tree Model with PCA Dimensionality Reduction

PCA, Principal Component Analysis, transforms a large set of variables to a smaller set but still contains most of the information from the larger set. Done by changing the number of dimensions in the output. 

In [101]:
import numpy as np
import pandas as pd

diabetes_df = pd.read_csv('../week_13/diabetes.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [102]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

#Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

In [105]:
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pca = PCA(n_components=6) 

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

# decision tree classifier
# model = tree.DecisionTreeClassifier(max_depth = 4,random_state=42).fit(X_train_pca,y_train)
# print(model.score(X_train_pca,y_train))

model = tree.DecisionTreeClassifier(max_depth = 4,random_state=42).fit(X_test_pca,y_test)
print(model.score(X_test_pca,y_test))


0.8636363636363636


In [106]:
y_pred = model.predict(X_test_pca)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       100
           1       0.78      0.85      0.81        54

    accuracy                           0.86       154
   macro avg       0.85      0.86      0.85       154
weighted avg       0.87      0.86      0.86       154



## SVD 

Reduces matrix to its constituent parts. Works for sparse data, so data with a lot of null values. 

In [99]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=7)

X_train_svd=svd.fit_transform(X_train)
X_test_svd=svd.fit_transform(X_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 4,random_state=42).fit(X_test_svd,y_test)

y_pred = model.predict(X_test_svd)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       100
           1       0.78      0.85      0.81        54

    accuracy                           0.86       154
   macro avg       0.85      0.86      0.85       154
weighted avg       0.87      0.86      0.86       154



## LDA 

Number of dimensions is limited to 1 and C-1, so for binary classification problems, the dimension is 1. 

In [100]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=1)

X_train_lda = lda.fit_transform(X_train,y_train)
X_test_lda = lda.fit_transform(X_test,y_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 4,random_state=42).fit(X_test_lda,y_test)

y_pred = model.predict(X_test_lda)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.71      0.82       100
           1       0.64      0.96      0.77        54

    accuracy                           0.80       154
   macro avg       0.81      0.84      0.80       154
weighted avg       0.86      0.80      0.80       154



In [63]:
def ip(address):
    ip_address_split = address.split(".")
    if len(ip_address_split) == 4:
        for i in ip_address_split:
            if int(i) > 255 or int(i) < 0:
                return False
        return True
    else:
        return False

In [64]:
ip('2.33.555.5')

False