# Decision Tree

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [None]:
df= sns.load_dataset('titanic')
df

In [None]:
df = df[['pclass','sex','age','sibsp','parch','fare', 'survived']]


In [None]:
df

### Label Encoding

In [None]:
df= pd.get_dummies(df, columns=['sex'])

df



In [None]:
df.isnull().sum()

In [None]:
df.fillna(df['age'].mean(), inplace=True)



In [None]:
df.isnull().sum()

In [None]:
df.nunique()

In [None]:
sns.histplot(df['fare'])

In [None]:
df['fare']= np.log(df['fare'])

In [None]:
sns.histplot(df['fare'])

In [None]:
sns.boxplot(x['age'])

In [None]:
Q1=df['age'].quantile(0.25)
Q3=df['age'].quantile(0.75)

IQR=Q3-Q1

upper_bound= Q3 +1.5*IQR
lower_bound= Q1 -1.5*IQR

df=df[(df['age']< upper_bound) & (df['age']> lower_bound)]


In [None]:
sns.boxplot(df['age'])

In [158]:
from sklearn.preprocessing import MinMaxScaler # For scalling in between 0_1

Cols=['age', 'fare']
scaler= MinMaxScaler()

df[Cols]= scaler.fit_transform(df[Cols])
df

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=10)

# Random State

In [None]:
model= DecisionTreeClassifier()

In [None]:
model.fit(x_train, y_train)

In [None]:
predictions= model.predict(x_test)

In [None]:
# show tree

plt.figure(figsize=(15,10))
tree.plot_tree(model, filled=True)
plt.show()
plt.savefig('tree.pdf', format='pdf' ,dpi=300)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
accuracy_score(y_test, predictions)

# overall all cetogories

In [None]:
precision_score(y_test, predictions)



Precision: proportion of true positive predictions among all positive predictions\
Mathematically, precision is calculated as the ratio of true positives to the sum of true positives and false positives:

In [None]:
recall_score(y_test, predictions)



Recall: proportion of true positive predictions among all actual positive instances.\
recall is calculated as the ratio of true positives to the sum of true positives and false negatives:

In [None]:
f1_score(y_test, predictions)

# average(harmonic mean) of precision and recall



The formula for the harmonic mean is: n / (1/x₁ + 1/x₂ + ... + 1/xₙ), where 'n' represents the count of numbers.

For example, if we want to calculate the harmonic mean of 3, 5, and 7:
(3 + 5 + 7) / (1/3 + 1/5 + 1/7) = 15 / (1/3 + 1/5 + 1/7)

**Harmonic Mean:** It is the average of reciprocal values, which helps measure multiplicative relationships.

### Confusion Matrix

![image](Extra\24.png)


In [None]:
from sklearn.metrics import confusion_matrix

cm= confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(cm, annot=True)

plt.xlabel('Predicted')
plt.ylabel('Actual')

plt.xticks([0, 1], ['Survived', 'Not_Survived'])
plt.yticks([0, 1], ['Survived', 'Not_Survived'])


# K Nearest Neighbour

In [None]:
from sklearn.neighbors import  KNeighborsClassifier
k_model= KNeighborsClassifier(n_neighbors=3)# This changing value
                                            # known as hyperparameter tuning

In [None]:
k_model.fit(x_train, y_train)

In [None]:
k_prediction= k_model.predict( x_test)

In [None]:
accuracy_score(y_test, k_prediction)

In [None]:
k_cm= confusion_matrix(y_test, k_prediction)

In [None]:
sns.heatmap(k_cm, annot=True)

#### Grid Search Cross Validation
for sekecting better k_no.

# Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier
model= RandomForestClassifier()
model.fit(x_train, y_train)
predictions= model.predict( x_test)
print('accuracy', accuracy_score(y_test, predictions))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))

cm= confusion_matrix(y_test, predictions)
print(sns.heatmap(cm, annot=True))



# Logistic Regression
It is a classification Model

**Sigmoid fuction :**  It divides the input into two classes


In [None]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression()
model.fit(x_train, y_train)
predictions= model.predict( x_test)
print('accuracy', accuracy_score(y_test, predictions))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))

cm= confusion_matrix(y_test, predictions)
print(sns.heatmap(cm, annot=True))



# SVM (Support Vector Machine)

![image](Extra\25.png)

In [None]:
from sklearn.svm import SVC
model= SVC()
model.fit(x_train, y_train)
predictions= model.predict( x_test)
print('accuracy', accuracy_score(y_test, predictions))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))

cm= confusion_matrix(y_test, predictions)
print(sns.heatmap(cm, annot=True))


# Naive Base

In [None]:
from sklearn.naive_bayes import GaussianNB
model= GaussianNB()
model.fit(x_train, y_train)
predictions= model.predict( x_test)
print('accuracy', accuracy_score(y_test, predictions))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))

cm= confusion_matrix(y_test, predictions)
print(sns.heatmap(cm, annot=True))


# choosing the Best Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score


In [None]:
classifiers = {
    "kNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB()
}

In [None]:
# Cross-validation
results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
    results[name] = scores.mean()


In [None]:
# Print mean accuracy for each classifier
for name, score in results.items():
    print(f"{name}: {score}")

In [None]:
# Select best model based on mean accuracy
best_model = max(results, key=results.get)
print(f"Best Model: {best_model}")

**On the basis of f1 score**

In [None]:
classifiers= {
    "kNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB()
}


results= {}
for name, clf in classifiers.items():
    scores= cross_val_score(clf, x_train, y_train, cv=5, scoring='f1_macro')
    results[name]= scores.mean()

for name, score in results.items():
    print(f"{name}: {score}")

best_model= max(results, key=results.get)
print(f'Best Model: {best_model}')