# understanding of the data

## basic imports


In [2]:
## Data handling (pandas, numpy) and plotting (matplotlib, seaborn, plotly) libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries above, so consider removing it
# while developing.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

## Data Cleaning
### Handling Missing values
1. Handling Missing values
2. Handling Duplicates
3. Check data type
4. Understand the dataset

## getting to know the data

In [None]:
df.info()
df.describe()
df.isnull().sum()
df[<column_name>].value_counts()

## Remove the duplicates

```python
df.drop_duplicates(inplace=True)
```


## getting the percentage of missing value

In [None]:
# Columns that contain at least one missing value, kept in their original order.
features_with_na = list(df.columns[df.isnull().any()])

# For each affected column, report the share of missing rows as a percentage
# rounded to 5 decimal places.
for col in features_with_na:
    pct_missing = np.round(df[col].isnull().mean() * 100, 5)
    print(col, pct_missing, '% missing values')

## Cleaning column values

In [None]:
# Normalise the misspelled category label 'Fe Male' to 'Female' in the Gender column.
df['Gender'] = df['Gender'].replace({'Fe Male': 'Female'})


## handling the missing values



# 🧠 Missing Value Imputation: Mean vs Median vs Mode

When your dataset contains missing values, you often need to fill them
(impute) to maintain data integrity.

Choosing between **mean**, **median**, or **mode** depends on the **type** and **distribution** of your data.

---

### 🧮 Mean (Average)

```python
df['column'] = df['column'].fillna(df['column'].mean())
```

#### ✅ When to use:
- The column is **numerical**.
- Data is **normally distributed** (i.e., symmetric).
- No significant **outliers**.

📌 **Example:** Heights, weights, temperatures.

---

### 📈 Median (Middle value)

```python
df['column'] = df['column'].fillna(df['column'].median())
```

#### ✅ When to use:
- The column is **numerical**.
- Data is **skewed** or contains significant **outliers**.
- You want a **robust** alternative to mean.

📌 **Example:** Income, house prices, medical bills.

---

### 🔁 Mode (Most frequent value)

```python
df['column'] = df['column'].fillna(df['column'].mode()[0])
```

#### ✅ When to use:
- The column is **categorical** or contains **discrete values**.
- You want to fill missing values with the **most frequent entry**.

📌 **Example:** Gender, city, payment method.

---

### 📊 Summary Table

| Data Type       | Condition               | Recommended Method | Code Snippet                                         |
|------------------|-------------------------|---------------------|------------------------------------------------------|
| Numerical        | Symmetric, no outliers  | Mean                | `df['col'].fillna(df['col'].mean())`                |
| Numerical        | Skewed, has outliers    | Median              | `df['col'].fillna(df['col'].median())`              |
| Categorical      | Repeated common values  | Mode                | `df['col'].fillna(df['col'].mode()[0])`             |

---

📝 **Tip:** Always visualize and explore your data before choosing an imputation method. Use `.describe()` and `.hist()` or seaborn plots to understand the distribution.


## 🔍 Feature Type Classification (Numerical, Categorical, Discrete, Continuous)


In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# every other dtype is treated as numerical.
cat_features = df.select_dtypes(include='object').columns.tolist()
num_features = df.select_dtypes(exclude='object').columns.tolist()

# Numerical columns with at most 25 distinct values count as discrete;
# the remaining numerical columns count as continuous.
discrete_features, continuous_features = [], []
for col in num_features:
    bucket = discrete_features if df[col].nunique() <= 25 else continuous_features
    bucket.append(col)

# Summary of how many columns fell into each group
print(f'Numerical: {len(num_features)}, Categorical: {len(cat_features)}, Discrete: {len(discrete_features)}, Continuous: {len(continuous_features)}')


# making the dataset ready for machine learning

## ⚙️ Preprocessing: Encode Categoricals and Scale Numericals


 🔧 `remainder='passthrough'` in `ColumnTransformer`

- **What it does**: Keeps columns not explicitly transformed and **passes them through unchanged**.
- **Is it compulsory?** No, default is `'drop'`, which removes untransformed columns.
- **Use case**: Use `'passthrough'` if you want to retain those columns.

**Example**:
- `'passthrough'`: Keeps untransformed columns.
- `'drop'`: Discards untransformed columns (default).


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# NOTE(review): this cell reads `X`, which this notebook only assigns in the
# train-test split cell further down; run that cell first.

# Column groups by dtype: object columns get one-hot encoded, all others scaled.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# drop='first' omits one dummy column per categorical feature.
oh_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

# Column transformer with two transformers; any column not listed in either
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    remainder='passthrough',
)

In [None]:
# Learn the encoding/scaling parameters from the training data only and apply
# them to it. (`X_train` / `X_test` come from the train-test split cell.)
X_train_transformed = preprocessor.fit_transform(X_train)
# Apply the parameters fitted above to the test data. Use transform(), not
# fit_transform(), so no information from the test set leaks into the fit.
X_test_transformed = preprocessor.transform(X_test)


## 🧪 Train-Test Split for Model Evaluation


In [None]:
from sklearn.model_selection import train_test_split
X = df.drop([<Target_col>], axis=1)
y = df[<Target_col>]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

# machine learning

## 🔍 Analyzing Model Performance on Training and Test Data


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve


def _report_classification_metrics(model, X, y, split_name):
    """Print accuracy, F1, precision, recall and ROC AUC of a fitted classifier.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X : features to score.
    y : true labels for ``X``.
    split_name : label used in the report header (e.g. 'Training', 'Test').
    """
    y_pred = model.predict(X)
    # ROC AUC measures how well the model RANKS samples, so it needs a
    # continuous score rather than hard 0/1 predictions.
    # Assumes a binary target whose positive class is the second entry of
    # model.classes_ -- TODO confirm for your dataset.
    y_score = model.predict_proba(X)[:, 1]

    print('Model performance for {} set'.format(split_name))
    print('- Accuracy: {:.4f}'.format(accuracy_score(y, y_pred)))
    print('- F1 score: {:.4f}'.format(f1_score(y, y_pred, average='weighted')))
    # NOTE(review): precision and recall use scikit-learn's default averaging
    # while F1 above is 'weighted' -- confirm this mix is intended.
    print('- Precision: {:.4f}'.format(precision_score(y, y_pred)))
    print('- Recall: {:.4f}'.format(recall_score(y, y_pred)))
    print('- Roc Auc Score: {:.4f}'.format(roc_auc_score(y, y_score)))


# Candidate classifiers, all with default hyperparameters.
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier()
}

# Fit each model on the training split and report the same metrics on both
# splits, so over-fitting shows up as a gap between the two reports.
for name, model in models.items():
    model.fit(X_train, y_train) # Train model

    print(name)

    _report_classification_metrics(model, X_train, y_train, 'Training')

    print('----------------------------------')

    _report_classification_metrics(model, X_test, y_test, 'Test')

    print('='*35)
    print('\n')

## 🔧 Hyperparameter Tuning for Models

In [None]:
# Search spaces for hyperparameter tuning.
# knn_params: candidate neighbourhood sizes for the K-Neighbors model.
knn_params = {"n_neighbors": [2, 3, 10, 20, 40, 50]}
# rf_params: candidate settings for the random-forest model.
# max_features: the string "auto" was deprecated in scikit-learn 1.1 and
# removed in 1.3; for random-forest regressors it meant "use all features",
# which is spelled 1.0 (a fraction of the features) in every version.
# NOTE(review): the integer choices (5, 7, 8) presumably must not exceed the
# number of input features -- verify against your dataset.
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, 1.0, 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}


In [None]:
# Models list for Hyperparameter tuning
# The two regressors are imported here because no other cell of this notebook
# imports them; without these lines the cell raises NameError on a fresh kernel.
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# Each entry is (short name, unfitted estimator, search space for that estimator).
randomcv_models = [('KNN', KNeighborsRegressor(), knn_params),
                   ("RF", RandomForestRegressor(), rf_params)

                   ]

In [None]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

# Best parameter set found for each model, keyed by the model's short name.
model_param = {}
for name, model, params in randomcv_models:
    # `search` rather than `random`, to avoid shadowing the stdlib module name.
    search = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,      # upper bound on sampled candidates
                                cv=3,            # 3-fold cross-validation
                                verbose=2,
                                n_jobs=-1,       # use every available core
                                random_state=42) # reproducible candidate sampling
    search.fit(X_train, y_train)
    model_param[name] = search.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

### after getting the best model

In [None]:
## Retraining the models with best parameters
# Imported here so the cell does not depend on another cell for these names.
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): `evaluate_model` is not defined in the visible part of this
# notebook. From its use below it is expected to take (y_true, y_pred) and
# return (mae, rmse, r2) -- make sure it is defined before running this cell.

# max_features=1.0 (use all features) replaces 'auto', which meant the same
# thing for regressors but was removed in scikit-learn 1.3.
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features=1.0, max_depth=None,
                                                     n_jobs=-1),
     "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10, n_jobs=-1)

}
for name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    print('='*35)
    print('\n')

### roc curve

In [None]:
## Plot ROC AUC Curve
from sklearn.metrics import roc_auc_score,roc_curve
plt.figure()

# Add the models to the list that you want to view on the ROC plot.
# The area under the curve is computed in the loop below and stored under the
# 'auc' key, so the legend always matches the curve that is actually drawn.
auc_models = [
{
    'label': 'Random Forest Classifier',
    'model': RandomForestClassifier(n_estimators=1000,min_samples_split=2,
                                          max_features=7,max_depth=None),
},

]
# Fit every listed model and draw its ROC curve
for algo in auc_models:
    model = algo['model'] # select the model
    model.fit(X_train, y_train) # train the model
    # Score of the positive class (second probability column); assumes a
    # binary target -- TODO confirm.
    y_score = model.predict_proba(X_test)[:, 1]
    # Compute False positive rate, and True positive rate
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    # Calculate Area under the curve to display on the plot
    algo['auc'] = roc_auc_score(y_test, y_score)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (algo['label'], algo['auc']))
# Custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig("auc.png")
plt.show()