## **Settings**

In [None]:
# Display every column when rendering wide DataFrames.
# pandas is imported locally because this settings cell sits above the main
# import cell; on a fresh kernel the bare `pd` reference raised NameError.
import pandas as pd

pd.set_option('display.max_columns', None)

## **EDA**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load kc_house_data.csv into the working frame.
df = pd.read_csv('kc_house_data.csv')

# Only the last expression of a cell is rendered, so the column index is
# shown explicitly; the dtypes Series is the cell's output.
display(df.columns)
df.dtypes

In [None]:
# Drop the 'id' column.
# `axis=0` was redundant next to `columns=`; errors='ignore' keeps the cell
# re-runnable (a second run used to raise KeyError because 'id' was gone).
df = df.drop(columns=['id'], errors='ignore')
df.head()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
summary_stats = df.describe(include='all')
summary_stats

In [None]:
# Number of distinct values per column, followed by the actual values of
# every column that has fewer than 10 of them.
unique_counts = df.nunique()
print(unique_counts)

low_cardinality = unique_counts[unique_counts < 10]
for column, n_unique in low_cardinality.items():
    values = df[column].unique()
    print('unique values for ', column, n_unique, values)
    print('{0} unique values for {1} - {2} '.format(n_unique, column, np.sort(values)))

In [None]:
# Mean price and row count per (bedrooms, yr_built) group.
# Pass a list of columns (e.g. [['price', 'sqft_living']]) to aggregate several at once.
# The result is now bound to `yrprice`; previously it was discarded and the
# next line failed with NameError.
yrprice = (
    df.groupby(['bedrooms', 'yr_built'])['price']
    .agg(['mean', 'count'])
    .reset_index()
)

# Aggregating a single column yields flat column names, so flattening is only
# needed (and only safe) when the columns really are a MultiIndex. Joining a
# plain string would insert '_' between its characters.
if isinstance(yrprice.columns, pd.MultiIndex):
    yrprice.columns = [
        '_'.join(str(level) for level in col if str(level)).strip()
        for col in yrprice.columns.to_flat_index()
    ]

yrprice.head()

In [None]:
# Missing values, extreme rows and duplicates.

# Missing values per column (isnull and isna are aliases, one call is enough).
display(df.isna().sum())
df = df.dropna()

# Rows above the 99th percentile of 'bathrooms'. They are only inspected here;
# nothing is removed from `df`.
bathroom_outliers = df[df['bathrooms'] > df['bathrooms'].quantile(0.99)]
display(bathroom_outliers)

# Every row that has an exact duplicate (keep=False flags all copies).
# The frame in this notebook is `df`; `train_data` was never defined.
duplicates = df[df.duplicated(keep=False)]
duplicates


In [None]:
# Visualise distributions to spot outliers.
# One axis per plot: the earlier 3x3 grid left seven empty panels.
# `data=` is passed by keyword so the call does not depend on seaborn's
# positional-argument order.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(data=df, x='price', ax=axes[0])
sns.scatterplot(data=df, x='sqft_living', y='price', hue='view', ax=axes[1])

plt.tight_layout()
plt.show()

# Other way: one histogram per column on a pyplot grid.
# Of the earlier placeholder names ('age', 'will_vote', 'x') only 'price' is
# referenced anywhere else in this notebook; the list now uses columns that
# the later cells select, and skips any that are absent instead of raising.
hist_columns = [
    column
    for column in ['price', 'sqft_living', 'bedrooms', 'bathrooms']
    if column in df.columns
]
plt.figure(figsize=(12, 8))
for position, column in enumerate(hist_columns, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(df[column])

plt.tight_layout()
plt.show()

## **Pre processing with Pipeline**

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Numeric predictors: fill gaps with the median, then standardise.
# 'price' is intentionally not listed: it is the target (y) and is not a
# column of X, so including it made ColumnTransformer fail on a missing
# column (and would have leaked the target into the features).
numeric_features = ['sqft_living', 'bedrooms']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical predictors: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
categorical_features = ['waterfront', 'view']
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# sparse_threshold=0 forces a dense array so the result can be wrapped in a
# DataFrame with named columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

In [None]:
 X = df[['date','bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']]
y= df[['price']]
# X.loc[:, 'date'] = x['date'].str[:8]  #substring

In [None]:
# Fit the preprocessor on X and apply imputation, scaling and one-hot encoding.
X_arr_transformed = preprocessor.fit_transform(X)

# ColumnTransformer can return a SciPy sparse matrix when the one-hot block
# dominates; densify so the array can be labelled column by column.
if hasattr(X_arr_transformed, 'toarray'):
    X_arr_transformed = X_arr_transformed.toarray()

# Output column order follows the transformer order: numeric first, then the
# expanded one-hot columns.
categorical_feature_transformed = (
    preprocessor.named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features)
)
feature_transformed = list(numeric_features) + list(categorical_feature_transformed)

# Transformed columns as a DataFrame.
X_df_transformed = pd.DataFrame(X_arr_transformed, columns=feature_transformed)

# Columns the preprocessor did not touch; index reset so the positional
# concat below lines rows up.
X_df_not_transformed = (
    X.drop(columns=numeric_features + categorical_features)
    .reset_index(drop=True)
)
X_df_final = pd.concat([X_df_not_transformed, X_df_transformed], axis=1)
X_df_final.head(10)

In [None]:
# Visualise feature correlation, rows ordered by correlation with 'price'.
# Indices are reset so features and target are joined row by row, and only
# numeric columns are correlated (corr() rejects string columns in recent pandas).
tmp = pd.concat(
    [X_df_final.reset_index(drop=True), y.reset_index(drop=True)],
    axis=1,
)
corr_by_price = tmp.select_dtypes(include='number').corr().sort_values('price')
sns.heatmap(corr_by_price, cmap='coolwarm')

In [None]:
# train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# The feature frame is `X_df_final` (capital X); `x_df_final` raised NameError.
x_train, x_test, y_train, y_test = train_test_split(
    X_df_final, y, test_size=0.2, random_state=42
)
print(x_train.shape)
print(x_test.shape)

## **Regression Modelling**

In [None]:
# Models
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Regressors to compare, keyed by display name. Insertion order is the order
# in which the training loop visits them.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'NeighborsRegressor': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'SVR': SVR(),
    'AdaBoost Regressor': AdaBoostRegressor(),
}

# Hyper-parameter grids, one named dict per estimator.
# Further options left disabled:
#   Random Forest      - 'criterion', 'max_features'
#   Decision Tree      - 'max_features'
#   Gradient Boosting  - 'loss', 'criterion', 'max_features'
#   AdaBoost Regressor - 'loss'
random_forest_grid = {
    'n_estimators': [64, 128],
    'max_depth': [None, 30],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
}

decision_tree_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
}

gradient_boosting_grid = {
    'learning_rate': [.1, .01, .05, .001],
    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
    'n_estimators': [8, 16, 32, 64, 128, 256],
}

xgb_grid = {
    'learning_rate': [.1, .01, .05, .001],
    'n_estimators': [8, 16, 32, 64, 128, 256],
}

catboost_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
}

adaboost_grid = {
    'learning_rate': [.1, .01, 0.5, .001],
    'n_estimators': [8, 16, 32, 64, 128, 256],
}

# Grids are looked up by the same display names used in `models`. Names with
# no entry in `models` ("Gradient Boosting", "XGBRegressor",
# "CatBoosting Regressor") are kept here but have no estimator registered above.
model_params = {
    "Linear Regression": {},
    "Random Forest": random_forest_grid,
    "Decision Tree": decision_tree_grid,
    "Gradient Boosting": gradient_boosting_grid,
    "XGBRegressor": xgb_grid,
    "CatBoosting Regressor": catboost_grid,
    "AdaBoost Regressor": adaboost_grid,
}

In [None]:
from datetime import datetime

# Grid-search every registered regressor (3-fold CV), then score the tuned
# model on the train and test splits.
model_score = []

# Estimators expect a 1-D target; y_train is a single-column frame.
y_train_1d = np.array(y_train).ravel()

for name, estimator in models.items():
    param_grid = model_params.get(name, {})  # no grid -> default parameters
    print('################################################', datetime.now())
    print(estimator, param_grid)

    ms = GridSearchCV(estimator, param_grid, cv=3)
    #ms = RandomizedSearchCV(estimator, param_grid, cv=3, random_state=42)
    ms.fit(x_train, y_train_1d)
    print(ms.best_params_)

    # With the default refit=True the search has already retrained the best
    # configuration on the whole training set, so no second fit is needed.
    model = ms.best_estimator_
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Each metric is computed once and reused.
    model_train_r2 = r2_score(y_train, y_train_pred)
    model_test_r2 = r2_score(y_test, y_test_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)

    # Keys follow one '<Metric> <Split>' pattern (previously 'R² Train' and
    # 'MAETrain' broke it).
    model_score.append({
        'Model': str(model),
        'R2 Train': model_train_r2,
        'R2 Test': model_test_r2,
        'MSE Train': mse_train,
        'MSE Test': mse_test,
        'RMSE Train': np.sqrt(mse_train),
        'RMSE Test': np.sqrt(mse_test),
        'MAE Train': mean_absolute_error(y_train, y_train_pred),
        'MAE Test': mean_absolute_error(y_test, y_test_pred),
    })
    print('train score {0} | test score {1}'.format(model_train_r2, model_test_r2))

pd.DataFrame(model_score).sort_values('R2 Test', ascending=False)

In [None]:
# Hand-tuned random forest, scored with R2 on both splits.
# random_state is fixed (same seed as the train/test split) so re-running the
# cell reproduces the same forest and the same scores.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=100,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
model.fit(x_train, np.array(y_train).ravel())  # ravel: estimator expects a 1-D target

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
model_train_r2 = r2_score(y_train, y_train_pred)
model_test_r2 = r2_score(y_test, y_test_pred)
print('train score {0} test score {1}'.format(model_train_r2, model_test_r2))

In [None]:
# Display / verify predictions next to the original-scale test features
# (scaling and one-hot encoding are inverted).
# Names match this notebook: pipeline steps 'scaler' / 'onehot', the lists
# numeric_features / categorical_features, the split x_test and the target
# column 'price'. The earlier names ('scale', 'encode', num_features,
# cat_features, X_test, 'Production') were not defined in the visible cells.
y_pred_df = pd.DataFrame(y_test_pred, columns=['Pred_price'])

num_pipeline = preprocessor.named_transformers_['num']
cat_pipeline = preprocessor.named_transformers_['cat']

categorical_feature_transformed = cat_pipeline['onehot'].get_feature_names_out(categorical_features)
feature_transformed = list(numeric_features) + list(categorical_feature_transformed)

# The fitted steps received plain arrays inside their pipelines, so arrays are
# passed back in to avoid feature-name mismatch warnings.
num_inverse_transformed = num_pipeline['scaler'].inverse_transform(
    x_test[numeric_features].to_numpy()
)
num_inverse_transformed_df = pd.DataFrame(num_inverse_transformed, columns=numeric_features)

cat_inverse_transformed = cat_pipeline['onehot'].inverse_transform(
    x_test[categorical_feature_transformed].to_numpy()
)
cat_inverse_transformed_df = pd.DataFrame(cat_inverse_transformed, columns=categorical_features)

# reset_index() keeps the original row label as a column and gives y_test the
# same 0..n-1 index as the freshly built frames.
X_test_verify = pd.concat(
    [y_test.reset_index(), y_pred_df, num_inverse_transformed_df, cat_inverse_transformed_df],
    axis=1,
)

# Relative absolute error as a fraction of the true price.
X_test_verify['diff'] = (
    (X_test_verify['price'] - X_test_verify['Pred_price']).abs() / X_test_verify['price']
)
X_test_verify[X_test_verify['price'] > 0].sort_values('diff', ascending=False).head(10)

In [None]:
# Density of the prediction error, with the share of probability mass that
# falls inside the [-20, 20] band.
# `simps` is deprecated and absent from current SciPy; `simpson` is its
# replacement and takes x by keyword. The fallback covers old installs.
try:
    from scipy.integrate import simpson
except ImportError:
    from scipy.integrate import simps as simpson

# NOTE(review): 'diff' is built elsewhere as a relative error; the 250 cut-off
# and the +/-20 band look like percent units - confirm the intended scale.
df_plot = X_test_verify[X_test_verify['diff'] < 250]
sns_plot = sns.kdeplot(df_plot['diff'], bw_adjust=0.5)

# The KDE curve is the first line drawn on the returned axes.
x_values = sns_plot.lines[0].get_xdata()
y_values = sns_plot.lines[0].get_ydata()

# Band mask computed once and reused for shading and integration.
in_band = (x_values >= -20) & (x_values <= 20)
plt.fill_between(x_values, 0, y_values, where=in_band,
                 color='grey', alpha=0.5)

# Simpson's-rule area under the whole curve and under the band.
total_area = simpson(y_values, x=x_values)
shaded_area = simpson(y_values[in_band], x=x_values[in_band])
percentage_shaded = (shaded_area / total_area) * 100
plt.text(0.05, 0.9, f'{percentage_shaded:.2f}% of the area', transform=plt.gca().transAxes)

## **Multi Classification**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss, accuracy_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from datetime import datetime

# Base classifiers to compare, as (display name, estimator) pairs.
# Note: this rebinds `models` / `model_params` from the regression section.
classifier_registry = [
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Support Vector Machine', SVC()),
    ('Naive Bayes', GaussianNB()),
]

models = dict(classifier_registry)

# Every classifier starts with an empty grid, i.e. default hyper-parameters.
model_params = {label: {} for label, _ in classifier_registry}

In [None]:
# Grid-search each base classifier wrapped in MultiOutputClassifier (one
# fitted copy per target column), then score on train and test.
# NOTE(review): X_train / X_test / y_train / y_test (capital X) are not defined
# in the visible cells - presumably a separate classification split; confirm.
model_score = []

for name, model in models.items():
    print('################################################', datetime.now())
    print(name)

    ms = GridSearchCV(
        MultiOutputClassifier(model),
        model_params.get(name, {}),
        cv=3,
        scoring='accuracy',
    )
    ms.fit(X_train, y_train)
    print(ms.best_params_)

    # With the default refit=True, best_estimator_ is already trained on the
    # full training set; the extra fit that used to follow only repeated it.
    best_model = ms.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    model_train_hamming = hamming_loss(y_train, y_train_pred)
    model_test_hamming = hamming_loss(y_test, y_test_pred)
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='macro')
    model_test_f1 = f1_score(y_test, y_test_pred, average='macro')

    model_score.append({
        'Model': name,
        'Hamming Loss Train': model_train_hamming,
        'Hamming Loss Test': model_test_hamming,
        'Accuracy Train': model_train_accuracy,
        'Accuracy Test': model_test_accuracy,
        'F1 Score Train': model_train_f1,
        'F1 Score Test': model_test_f1
    })

    print(f'train score - Accuracy: {model_train_accuracy}, F1 Score: {model_train_f1}')
    print(f'test score - Accuracy: {model_test_accuracy}, F1 Score: {model_test_f1}')

# Best macro-F1 on the test split first.
results_df = pd.DataFrame(model_score).sort_values('F1 Score Test', ascending=False)
results_df

In [None]:
# Single multi-output gradient-boosting model with default settings,
# evaluated on the test split only.
basemodel = GradientBoostingClassifier()
model = MultiOutputClassifier(basemodel)
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)

# Macro-F1, subset accuracy and Hamming loss on the held-out predictions.
model_test_f1 = f1_score(y_test, y_test_pred, average='macro')
model_test_accuracy = accuracy_score(y_test, y_test_pred)
model_test_hamming = hamming_loss(y_test, y_test_pred)

report_template = 'test_accuracy {0} test f1 {1}'
print(report_template.format(model_test_accuracy, model_test_f1))

In [None]:
# Display / verify: put true and predicted labels next to the test features.
# NOTE(review): `hot` and `feature_names` come from outside the visible cells;
# `hot` is presumably the encoder that produced y - confirm.

# Predictions as a frame whose columns carry a 'pred_' prefix.
prefixed_names = {col: 'pred_' + col for col in feature_names}
y_test_pred_df = pd.DataFrame(y_test_pred, columns=feature_names).rename(columns=prefixed_names)

# Map the encoded targets and predictions back through `hot`.
actual_labels = hot.inverse_transform(y_test)
predicted_labels = hot.inverse_transform(y_test_pred_df)
y_test_df = pd.DataFrame(actual_labels, columns=['candidate_test'])
y_pred_df = pd.DataFrame(predicted_labels, columns=['candidate_pred'])

# reset_index() keeps the original row label as a column and aligns X_test
# with the two freshly built label frames.
verify_parts = [y_test_df, y_pred_df, X_test.reset_index()]
X_test_verify = pd.concat(verify_parts, axis=1)

In [None]:
#confusion_matrix  -  X_test_verify.groupby(['candidate_test', 'candidate_pred']).size()
from sklearn.metrics import confusion_matrix

# Confusion matrix of actual vs predicted labels, over rows with no missing values.
X_test_verify_cm = X_test_verify.dropna()
actual = X_test_verify_cm['candidate_test']
predicted = X_test_verify_cm['candidate_pred']

# Labels in order of first appearance among the actual values; the same order
# is used for matrix rows and columns.
class_labels = actual.unique()
cm = confusion_matrix(actual, predicted, labels=class_labels)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')