In [None]:
!pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI repository dataset id 17 and separate it into the feature table
# and the target table.
data_cancer = fetch_ucirepo(id=17)
X = data_cancer.data.features
Y = data_cancer.data.targets

# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
X.info()
Y.info()

In [None]:
# Distinct labels present in the target column.
Y.loc[:, 'Diagnosis'].unique()

In [None]:
# Number of missing values in each feature column.
X.isna().sum()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# LabelEncoder expects a 1-D array of labels. Y is a single-column DataFrame,
# so pass the 'Diagnosis' column itself; handing over the whole frame makes
# scikit-learn emit a column-vector DataConversionWarning.
le = LabelEncoder()
y_encoded = le.fit_transform(Y['Diagnosis'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Keep the 10 features with the highest chi-squared score against the label.
# The selector is fitted on the training split only and then applied,
# unchanged, to both splits.
select_features = SelectKBest(score_func=chi2, k=10)
select_features.fit(X_train, Y_train)
X_train_selected = select_features.transform(X_train)
X_test_selected = select_features.transform(X_test)

In [None]:
# Shapes of the reduced train / test matrices, one per line.
print(X_train_selected.shape, X_test_selected.shape, sep='\n')

In [None]:
# L1-regularised logistic regression using the liblinear solver.
lr_model = LogisticRegression(
    penalty='l1',       # other options: 'l2' or 'elasticnet' (the latter requires solver='saga')
    C=1.0,              # inverse regularisation strength
    fit_intercept=True,
    random_state=42,    # liblinear shuffles the data; a fixed seed makes the fit reproducible
    solver='liblinear',
    max_iter=1000,
    # `multi_class` is deliberately not passed: 'auto' was already the default,
    # and the parameter is deprecated since scikit-learn 1.5 (passing it
    # triggers a FutureWarning at fit time).
    verbose=0,
    warm_start=False,
    n_jobs=None,
)

In [None]:
# Train the logistic regression on the training split.
# NOTE(review): this fits on every column of X_train; the chi2-selected matrix
# X_train_selected built earlier is not used here - confirm that is intended.
lr_model.fit(X=X_train, y=Y_train)

In [None]:
# Predicted encoded labels for the held-out rows.
y_pred = lr_model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,accuracy_score,confusion_matrix

In [None]:
# Fraction of held-out rows the logistic regression classified correctly.
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Accuracy is:", accuracy)

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_true=Y_test, y_pred=y_pred))

In [None]:
# Confusion matrix for the logistic regression, labelled with the original
# (un-encoded) class names taken from Y.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cmat = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=np.unique(Y).tolist(),
)
cmat.plot()

In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree

# Column names of the full feature table.
features_data = X.columns

# Single decision tree with a depth cap.
dt = DecisionTreeClassifier(
    criterion='gini',       # or 'entropy' / 'log_loss'
    splitter='best',        # or 'random'
    max_depth=10,           # integer cap, or None for unlimited depth
    min_samples_split=2,    # a node needs at least 2 samples to be split
    min_samples_leaf=1,
    max_features=None,      # None = consider all features; or 'sqrt', 'log2', int, float
    # Features are randomly permuted at each split even with splitter='best',
    # so a fixed seed is needed for a reproducible tree.
    random_state=42,
)

In [None]:
# Rebuild the 80/20 train/test split from the full feature table, using the
# same arguments and seed as the split made before feature selection.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
X_train.shape

In [None]:
# Train the decision tree on all feature columns of the training split.
dt.fit(X=X_train, y=Y_train)

In [None]:
# Decision tree predictions for the held-out rows.
y_pred_dt = dt.predict(X=X_test)

In [None]:
# Per-class report followed by overall accuracy for the decision tree.
print(classification_report(y_true=Y_test, y_pred=y_pred_dt))
print(accuracy_score(y_true=Y_test, y_pred=y_pred_dt))

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np

# Draw the fitted decision tree, with nodes coloured by class and annotated
# with the feature and class names.
plt.figure(figsize=(28, 10))
plot_tree(
    dt,
    feature_names=list(X_train.columns),
    class_names=np.unique(Y).tolist(),
    filled=True,
)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees.
rf = RandomForestClassifier(
    n_estimators=100,       # number of trees; fewer (e.g. 50) for small data, more (200-500) for large / high-dimensional data
    criterion='gini',       # default; 'entropy' is the information-gain alternative
    max_depth=None,         # None = grow fully; a small cap (5-10) limits overfitting on small datasets
    min_samples_split=2,    # raise (e.g. 5-10) for noisy data
    min_samples_leaf=1,
    max_features='sqrt',    # features considered per split; or 'log2', or a fixed number
    bootstrap=True,         # each tree is trained on a bootstrap sample of the rows
    # Bootstrapping and per-split feature sampling are random; fix the seed so
    # the predictions, feature importances and individual trees are reproducible.
    random_state=42,
)

In [None]:
# Train the forest, then predict the held-out rows (fit() returns the estimator).
rf_pred = rf.fit(X_train, Y_train).predict(X_test)

In [None]:
# Importance score of every feature, in the column order of the training data.
important_feature = rf.feature_importances_
print(important_feature)

In [None]:
# Feature positions ordered from most to least important
# (ascending argsort, then reversed).
indices = np.flip(np.argsort(important_feature))
indices

In [None]:
# Feature names in descending order of importance.
X.columns[indices]

In [None]:
# Feature names ordered from most to least important, as a plain list.
# (The former bare `indices.tolist()` statement built a list and discarded it,
# and the names were appended one at a time in a manual loop.)
col = X.columns[indices].tolist()
for feature_name in col:
    print(feature_name)

In [None]:
# Bar chart of the forest's feature importances, most important first.
fig, ax = plt.subplots(figsize=(18, 10))
ax.set_title('Feature Importance')
ax.bar(col, important_feature[indices], color='b', align='center')
# Feature names are long, so draw the x tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Random forest evaluation: per-class report, overall accuracy, confusion matrix.
print(classification_report(y_true=Y_test, y_pred=rf_pred))
print("Accuracy score", accuracy_score(y_true=Y_test, y_pred=rf_pred))

cm = confusion_matrix(y_true=Y_test, y_pred=rf_pred)
cmat = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=np.unique(Y).tolist(),
)
cmat.plot()

In [None]:
# Visualise one member of the forest: index 1 is the second fitted tree.
single_tree = rf.estimators_[1]

plt.figure(figsize=(18, 8))
plot_tree(
    single_tree,
    feature_names=list(X_train.columns),
    class_names=np.unique(Y).tolist(),
    filled=True,
)
plt.show()

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel.
svc = SVC(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='scale',
    coef0=0.0,
    shrinking=True,
    probability=False,
    cache_size=200,
    class_weight=None,  # or a {class_label: weight} dict, or 'balanced'
    verbose=False,
    max_iter=1000,      # hard cap on solver iterations
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the chi2-selected features; the scaler's statistics come from
# the training split only and are reused for the test split.
sc = StandardScaler()
sc.fit(X_train_selected)
X_train_scaled = sc.transform(X_train_selected)
X_test_scaled = sc.transform(X_test_selected)

# Y_train / Y_test come from the most recent split, which used the same
# arguments and seed as the split the selected matrices were derived from.
svc.fit(X_train_scaled, Y_train)
s_pred = svc.predict(X_test_scaled)

print(confusion_matrix(Y_test, s_pred))
print(accuracy_score(Y_test, s_pred))