NAME: Supreeth Hassan Ravindra, 
CWID: 20020533, 
HOMEWORK-6: C50 and Random Forest methodology

In [1]:
# Importing Required Libraries and Packages
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Load the Wisconsin breast-cancer dataset. read_csv already returns a
# DataFrame, so no additional pd.DataFrame(...) wrapping is needed.
df = pd.read_csv("breast-cancer-wisconsin.csv")
df.head()  # Display the first 5 records

Unnamed: 0,Sample,F1,F2,F3,F4,F5,F6,F7,F8,F9,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Summary statistics of the dataset (count, mean, std, min, quartiles, max).
# NOTE(review): F6 does not appear in the printed summary below, which suggests
# it was read as a non-numeric column -- confirm its dtype before modelling.
summary = df.describe()
print(summary)

             Sample          F1          F2          F3          F4  \
count  6.990000e+02  699.000000  699.000000  699.000000  699.000000   
mean   1.071704e+06    4.417740    3.134478    3.207439    2.806867   
std    6.170957e+05    2.815741    3.051459    2.971913    2.855379   
min    6.163400e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.706885e+05    2.000000    1.000000    1.000000    1.000000   
50%    1.171710e+06    4.000000    1.000000    1.000000    1.000000   
75%    1.238298e+06    6.000000    5.000000    5.000000    4.000000   
max    1.345435e+07   10.000000   10.000000   10.000000   10.000000   

               F5          F7          F8          F9       Class  
count  699.000000  699.000000  699.000000  699.000000  699.000000  
mean     3.216023    3.437768    2.866953    1.589413    2.689557  
std      2.214300    2.438364    3.053634    1.715078    0.951273  
min      1.000000    1.000000    1.000000    1.000000    2.000000  
25%      2.000000   

In [4]:
# Treat the '?' placeholder as missing and drop every row that contains one,
# then convert all columns to numeric dtypes. Without the conversion, a column
# that held '?' keeps its object/string dtype after the rows are dropped, is
# left out of describe(), and reaches the models only through implicit
# string-to-float coercion.
df = df[df != '?'].dropna().apply(pd.to_numeric)
print(df[df.isna().any(axis=1)])  # Sanity check: prints an empty frame when no NaN rows remain

Empty DataFrame
Columns: [Sample, F1, F2, F3, F4, F5, F6, F7, F8, F9, Class]
Index: []


In [5]:
# Separate the target from the predictors
y = df['Class']                               # target variable
X = df.drop(['Class', 'Sample'], axis=1)      # features: everything except the label and the sample id
X.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [6]:
# Preview the first rows of the target column
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [7]:
# Hold out 30% of the rows for testing (70% for training);
# the fixed seed keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [8]:
# "C50" methodology: an entropy-based decision tree.
# Note: scikit-learn's DecisionTreeClassifier is a CART implementation;
# criterion='entropy' selects information-gain splits in the spirit of C5.0,
# but it is not the C5.0 algorithm itself.
# random_state is fixed because the tree permutes candidate features at each
# split, so ties between equally good splits would otherwise make the fitted
# tree (and the reported accuracy) vary from run to run.
c50_model = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Train the classifier
c50_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = c50_model.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the C50 methodology model:', accuracy*100, "%")

Accuracy of the C50 methodology model: 95.1219512195122 %


In [9]:
# Evaluate the C50 model: confusion matrix followed by per-class metrics
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:', cm, sep='\n')
print(
    'Classification Report of the C50 model:',
    classification_report(y_test, y_pred),
    sep='\n',
)

Confusion matrix:
[[129   3]
 [  7  66]]
Classification Report of the C50 model:
              precision    recall  f1-score   support

           2       0.95      0.98      0.96       132
           4       0.96      0.90      0.93        73

    accuracy                           0.95       205
   macro avg       0.95      0.94      0.95       205
weighted avg       0.95      0.95      0.95       205



In [10]:
# Random Forest: fit 100 trees with a fixed seed, predict on the held-out set,
# and report the test accuracy as a percentage.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, rf_pred)
print("Accuracy of Random Forest model: ", accuracy_rf * 100, "%")

Accuracy of Random Forest model:  98.53658536585365 %


In [11]:
# Evaluate the Random Forest model: confusion matrix followed by per-class metrics
rfm = confusion_matrix(y_test, rf_pred)
print('Confusion matrix:', rfm, sep='\n')
print(
    'Classification Report of the Random Forest model:',
    classification_report(y_test, rf_pred),
    sep='\n',
)

Confusion matrix:
[[129   3]
 [  0  73]]
Classification Report of the Random Forest model:
              precision    recall  f1-score   support

           2       1.00      0.98      0.99       132
           4       0.96      1.00      0.98        73

    accuracy                           0.99       205
   macro avg       0.98      0.99      0.98       205
weighted avg       0.99      0.99      0.99       205



In [12]:
# Pair each feature name with its importance score from the fitted forest
name_of_feature = X.columns
feature_importance = rf.feature_importances_
feature_imp_data = pd.DataFrame(
    dict(feature=name_of_feature, Importance=feature_importance)
)

In [13]:
# Rank the features from most to least important
feature_imp_data = feature_imp_data.sort_values('Importance', ascending=False)

In [14]:
# Display the feature-importance table
feature_imp_data

Unnamed: 0,feature,Importance
1,F2,0.309099
2,F3,0.220844
5,F6,0.157261
7,F8,0.097594
6,F7,0.092953
0,F1,0.051348
4,F5,0.034994
3,F4,0.026879
8,F9,0.009027
