### <div align="center">Data cleaning</div>

In [None]:
# Count the missing values in each column, then drop every row that has any
df.isna().sum()
df.dropna(inplace=True)

# Distinct values and their counts
df['Category'].value_counts()

# Summary statistics for a single column
df.column_name.describe()

# Fill NA values with the mean (swap in .median() or .mode()[0] as needed).
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on a selected column acts on an intermediate
# object, is deprecated in pandas 2.x, and leaves df unchanged under
# Copy-on-Write.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Normalise column names: replace spaces with underscores and lower-case them
df.columns = df.columns.str.replace(" ", "_").str.lower()

# Handle duplicates: count the duplicated rows, then drop them
df.duplicated().sum()
df.drop_duplicates(inplace=True)

# Unique values of number_of_dependants among the rows where it is negative
df.loc[df['number_of_dependants'] < 0, 'number_of_dependants'].unique()

# Replace every value of the column with its absolute value
df['number_of_dependants'] = abs(df['number_of_dependants'])

# Names of the columns whose dtype is float64 or int64
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
# Names of all columns
df.columns

# Outlier treatment, income column: the 99.9th percentile is the threshold
quantile_thresold = df1['income_lakhs'].quantile(0.999)  # Output: 100.0

# Categorical columns: print the unique values found in each one
categorical_cols = [
    'gender',
    'region',
    'marital_status',
    'bmi_category',
    'smoking_status',
    'employment_status',
    'income_level',
    'medical_history',
    'insurance_plan',
]
for col in categorical_cols:
    unique_values = df2[col].unique()
    print(col, ":", unique_values)

### <div align="center">Feature Engineering</div>

In [None]:
# Add a column by applying a transformation.
# Binary-encode 'smoking_status': 'Yes' becomes 1, every other value
# (including NaN) becomes 0. The vectorised comparison gives the same result
# as a row-wise apply/lambda without calling Python code per row.
df['smoking_status'] = (df['smoking_status'] == 'Yes').astype('int64')

# Map the field values to numeric codes. Values that are not keys of the
# mapping become NaN. The column is addressed as 'gender' because the
# cleaning step lower-cases every column name.
df['gender'] = df['gender'].map({'male': 1, 'female': 2})

# Add an artificial genetical_risk column, spelled in lower case so it
# matches the name listed in cols_to_scale
df['genetical_risk'] = 0

# Risk score of each medical condition (keys are lower-case)
risk_scores = {"diabetes": 6, "heart disease": 8, "high blood pressure":6, "thyroid": 5, "no disease": 0, "none":0}

# Split "A & B" into at most two lower-cased parts.
# - n=1 caps the result at two columns, so a row with more than one " & "
#   cannot break the two-column assignment below.
# - reindex guarantees both columns exist even when no row contains " & ".
# - Missing parts (single-condition rows, NaN history) become 'none'.
Diseases = ['disease1', 'disease2']
disease_parts = (
    df2['medical_history']
    .str.lower()
    .str.split(" & ", n=1, expand=True)
    .reindex(columns=[0, 1])
    .fillna('none')
)
disease_parts.columns = Diseases
df2[Diseases] = disease_parts

# Add up the scores of both conditions. A condition that is not a key of
# risk_scores counts as 0 instead of turning the row's total into NaN.
df2['total_risk_score'] = 0
for disease in Diseases:
    df2['total_risk_score'] += df2[disease].map(risk_scores).fillna(0).astype(int)

# Encode text columns: replace each insurance plan name with a numeric code
df2['insurance_plan'] = df2['insurance_plan'].map({'Bronze': 1, 'Silver': 2, 'Gold': 3})

# Separate the target column from the feature columns
y = df4['annual_premium_amount']
X = df4.drop(columns='annual_premium_amount')

from sklearn.preprocessing import MinMaxScaler

# Rescale the listed feature columns with a min-max scaler.
# NOTE(review): the scaler is fitted on every row of X, before the
# train/test split performed later in this notebook — confirm that is intended.
cols_to_scale = [
    'age',
    'number_of_dependants',
    'income_level',
    'income_lakhs',
    'insurance_plan',
    'genetical_risk',
]
scaler = MinMaxScaler()

scaled_values = scaler.fit_transform(X[cols_to_scale])
X[cols_to_scale] = scaled_values
X.describe()

# Calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    """Return the variance inflation factor of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame. Its ``.values`` array is handed to
        ``variance_inflation_factor`` once per column position.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Column'`` holds the names from ``data.columns`` and
        ``'VIF'`` holds the factor computed for the column at that position.
    """
    # Materialise the array once instead of once per column
    feature_matrix = data.values
    vif_values = [
        variance_inflation_factor(feature_matrix, position)
        for position in range(data.shape[1])
    ]
    return pd.DataFrame({'Column': data.columns, 'VIF': vif_values})

# VIF of every feature column in X
calculate_vif(X)

# Drop income_level due to its high VIF value.
# NOTE(review): income_lakhs is the other income feature in X — confirm that
# income_level is the column meant to be removed.
X_reduced = X.drop('income_level', axis="columns")

- Decision tree (gini and entropy) is important because it is used in ensemble learning models like RandomForest, XGBoost, etc.

<h3 align="center" style="color:blue">Model Training</h3>

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [None]:
# Train/test split: 30% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.30,
    random_state=10,
)

# Linear regression: fit, then score on the train and test sets
model_lr = LinearRegression().fit(X_train, y_train)
train_score, test_score = (
    model_lr.score(X_train, y_train),
    model_lr.score(X_test, y_test),
)
train_score, test_score

# Ridge regression: same procedure with alpha=1
model_rg = Ridge(alpha=1).fit(X_train, y_train)
train_score, test_score = (
    model_rg.score(X_train, y_train),
    model_rg.score(X_test, y_test),
)
train_score, test_score

# XGBoost: 20 trees of depth 3, scored on the test set
from xgboost import XGBRegressor

model_xgb = XGBRegressor(max_depth=3, n_estimators=20)
model_xgb.fit(X_train, y_train)
model_xgb.score(X_test, y_test)

In [None]:
# Randomized hyper-parameter search over an XGBoost regressor
model_xgb = XGBRegressor()
param_grid = dict(
    n_estimators=[20, 40, 50],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)
# 10 sampled candidates, 3-fold cross-validation, ranked by r2
random_search = RandomizedSearchCV(
    estimator=model_xgb,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
random_search.best_score_

random_search.best_params_

In [None]:
# Export the model and the fitted scaler
import os

from joblib import dump

# The estimator refitted by the randomized search is the one to export.
# Nothing in the visible notebook assigns best_model, so bind it here;
# otherwise the dump below raises NameError on a fresh kernel.
best_model = random_search.best_estimator_

# dump() does not create missing directories
os.makedirs("artifacts", exist_ok=True)

dump(best_model, "artifacts/model_rest.joblib")

# Store the scaler together with the column list it was fitted on, so that
# whoever loads it can apply it to the same columns
scaler_with_cols = {
    'scaler': scaler,
    'cols_to_scale': cols_to_scale
}
dump(scaler_with_cols, "artifacts/scaler_rest.joblib")