# Data Cleaning

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
def _memory_size_to_gb(raw) -> int:
    """Convert one ``memory_card_size`` entry (e.g. ``"512GB"``, ``"1TB"``) to whole gigabytes.

    A trailing ``TB`` multiplies the number by 1000, a trailing ``GB`` (or no
    unit at all) leaves it unchanged. Units are matched case-insensitively.

    Raises:
        ValueError: if the entry is missing or is not a plain number with an
            optional GB/TB suffix.
    """
    text = str(raw).strip().upper()
    factor = 1
    if text.endswith("TB"):
        text, factor = text[:-2], 1000
    elif text.endswith("GB"):
        text = text[:-2]
    try:
        # float() tolerates surrounding whitespace and decimals such as "1.5".
        return int(float(text) * factor)
    except (ValueError, OverflowError):
        raise ValueError(f"Unrecognised memory_card_size value: {raw!r}") from None


def cleaning(df:pd.DataFrame) -> pd.DataFrame:
    """Normalise a raw smartphone frame and return it.

    The frame is modified in place (and also returned, so callers can write
    ``df = cleaning(df)``). Steps, in order:

    1. Strip column names and replace spaces with underscores.
    2. Drop duplicate rows.
    3. ``Processor_Series``: turn a `` Gen<N>`` suffix into a ``.<N>`` decimal,
       map ``Unknown`` to 35, then cast to float.
    4. ``memory_card_size``: parse into integer gigabytes stored as
       ``memory_card_size_GB``; the original column is dropped.
    5. ``os_version``: strip ``v`` and the first dot, cast to float, then divide
       values above 17 by 10.
    6. Lower-case and strip every remaining object-dtype column.

    Args:
        df: frame containing at least ``Processor_Series``,
            ``memory_card_size`` and ``os_version`` (after column renaming).

    Returns:
        The same ``df`` object, cleaned.

    Raises:
        ValueError: if a ``memory_card_size`` entry cannot be parsed.
    """
    df.columns = df.columns.str.strip().str.replace(" ", "_", regex=False)
    df.drop_duplicates(inplace=True)

    # " Gen1" -> ".1", " Gen2" -> ".2", ... so "8 Gen1" becomes 8.1; 35 is the
    # placeholder series number used for "Unknown".
    df["Processor_Series"] = (
        df["Processor_Series"]
        .str.replace(r" Gen(\d+)", r".\1", regex=True)
        .str.replace("Unknown", "35", regex=False)
        .astype(float)
    )

    # Parse sizes explicitly rather than eval()-ing cell contents: eval would
    # execute whatever Python expression happens to be stored in the CSV.
    df["memory_card_size_GB"] = df["memory_card_size"].map(_memory_size_to_gb).astype(int)
    df.drop(columns="memory_card_size",inplace=True)

    # "v12.0" -> "120" -> 120.0; only the FIRST dot is removed (n=1).
    os_version = (
        df["os_version"]
        .str.replace("v", "", regex=False)
        .str.replace(".", "", n=1, regex=False)
        .astype(float)
    )
    # Values above 17 are the dot-stripped form (120 for 12.0): scale them back.
    df["os_version"] = os_version.mask(os_version > 17, os_version / 10)

    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    for c in categorical_columns:
        df[c] = df[c].str.lower().str.strip()

    return df
# test

In [3]:
def replace_to_other(df,min_frq):
    """Collapse infrequent categories of every object-dtype column into ``'Other'``.

    For each object column, any value whose count is below ``min_frq`` is
    replaced by the literal string ``'Other'``. The frame is modified in place
    and also returned.

    Args:
        df: frame to update.
        min_frq: minimum number of occurrences a value needs in order to be kept.

    Returns:
        The same ``df`` object.
    """
    text_columns = [name for name in df if df[name].dtype in ["object"]]
    for name in text_columns:
        frequencies = df[name].value_counts()
        rare_values = frequencies[frequencies < min_frq].index
        df[name] = df[name].replace(rare_values, 'Other')
    return df


In [4]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

# Read both splits from the working directory, then run the shared cleaning step on each.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
df_train, df_test = cleaning(df_train), cleaning(df_test)

In [5]:
def _impute_from_reference(frame, reference):
    """Fill gaps in ``frame`` using statistics computed on ``reference``.

    * int64/float64 columns: NaN is replaced by the reference column's mean.
    * object columns: NaN is replaced by the reference column's most frequent
      value, and the text ``"unknown"`` is replaced by that same value.

    ``fillna`` and ``str.replace`` return new Series, so each result is
    assigned back to the column; calling them without assignment changes nothing.

    Args:
        frame: frame to impute (its columns are reassigned).
        reference: frame the mean/mode are taken from; must contain every
            column of ``frame``.

    Returns:
        ``frame``, for convenience.
    """
    for col in frame:
        if frame[col].dtype in ["int64", "float64"]:
            frame[col] = frame[col].fillna(reference[col].mean())
        elif frame[col].dtype in ["object"]:
            modes = reference[col].mode()
            if modes.empty:
                # Column is entirely missing in the reference: nothing to impute with.
                continue
            most_common = modes[0]
            # NOTE: if the most frequent value is itself "unknown", the replace is a no-op.
            frame[col] = (
                frame[col]
                .fillna(most_common)
                .str.replace("unknown", most_common, regex=False)
            )
    return frame


# The test split is imputed with TRAIN statistics so no information flows from test.
# Train is processed first; filling with its own mean/mode leaves those statistics unchanged.
df_train = _impute_from_reference(df_train, df_train)
df_test = _impute_from_reference(df_test, df_train)




In [None]:
# Collapse categories that occur fewer than 10 times in an object column into 'Other'.
# NOTE(review): only df_train is passed through replace_to_other here; df_test keeps
# its original categories — confirm this asymmetry is intended.
df_train = replace_to_other(df_train,10)

In [6]:
# Preview the first rows of the cleaned test split.
df_test.head()

Unnamed: 0,price,rating,Dual_Sim,4G,5G,Vo5G,NFC,IR_Blaster,Processor_Brand,Processor_Series,Core_Count,Clock_Speed_GHz,Performance_Tier,RAM_Size_GB,Storage_Size_GB,RAM_Tier,battery_capacity,fast_charging_power,Screen_Size,Resolution_Width,Resolution_Height,Refresh_Rate,Notch_Type,primary_rear_camera_mp,num_rear_cameras,primary_front_camera_mp,num_front_cameras,memory_card_support,os_name,os_version,brand,memory_card_size_GB
0,non-expensive,82.0,yes,yes,yes,no,no,no,dimensity,920.0,8.0,2.5,unknown,8.0,256.0,high-end,4500.0,60,6.4,1080.0,2400.0,90.0,punch hole,50.0,3.0,16.0,1.0,no,android,12.0,realme,1000
1,non-expensive,83.0,yes,yes,no,no,no,no,dimensity,800.0,8.0,2.4,unknown,8.0,128.0,high-end,4000.0,33,6.44,1080.0,2400.0,90.0,water drop notch,64.0,3.0,44.0,1.0,yes,android,11.0,vivo,1000
2,expensive,89.0,yes,yes,yes,no,yes,no,snapdragon,8.1,8.0,2.4,flagship,8.0,128.0,high-end,5000.0,65,6.7,1440.0,3216.0,120.0,punch hole,50.0,3.0,32.0,1.0,yes,android,12.0,realme,1000
3,non-expensive,81.0,yes,yes,yes,no,yes,yes,snapdragon,778.0,8.0,2.4,high-end,6.0,128.0,mid-range,5000.0,67,6.67,1080.0,2400.0,120.0,punch hole,108.0,3.0,16.0,1.0,no,android,13.0,poco,1000
4,non-expensive,82.0,yes,yes,yes,no,yes,yes,dimensity,8100.0,8.0,2.85,unknown,6.0,128.0,mid-range,5080.0,67,6.6,1080.0,2460.0,144.0,punch hole,64.0,3.0,16.0,1.0,yes,android,12.0,xiaomi,1000


In [9]:
# Preview the first rows of the cleaned train split.
df_train.head()

Unnamed: 0,price,rating,Dual_Sim,4G,5G,Vo5G,NFC,IR_Blaster,Processor_Brand,Processor_Series,Core_Count,Clock_Speed_GHz,Performance_Tier,RAM_Size_GB,Storage_Size_GB,RAM_Tier,battery_capacity,fast_charging_power,Screen_Size,Resolution_Width,Resolution_Height,Refresh_Rate,Notch_Type,primary_rear_camera_mp,num_rear_cameras,primary_front_camera_mp,num_front_cameras,memory_card_support,os_name,os_version,brand,memory_card_size_GB
0,expensive,85.0,yes,yes,yes,no,yes,no,snapdragon,870.0,8.0,3.2,flagship,12.0,256.0,flagship,4400.0,66,6.62,1080.0,2400.0,120.0,punch hole,48.0,3.0,16.0,1.0,yes,android,12.0,iqoo,1000
1,non-expensive,88.0,yes,yes,yes,no,yes,no,snapdragon,865.0,8.0,2.84,flagship,8.0,128.0,high-end,4500.0,0,6.5,1080.0,2400.0,120.0,punch hole,12.0,3.0,32.0,1.0,yes,android,10.0,samsung,1000
2,non-expensive,75.0,yes,yes,yes,no,no,no,snapdragon,4.0,8.0,2.4,unknown,4.0,64.0,mid-range,5000.0,18,6.58,1080.0,2408.0,120.0,water drop notch,50.0,2.0,8.0,1.0,yes,android,12.0,iqoo,1000
3,non-expensive,74.0,yes,yes,no,no,no,yes,helio,99.0,8.0,2.2,mid-range,4.0,64.0,mid-range,5000.0,18,6.58,1080.0,2400.0,90.0,water drop notch,50.0,3.0,8.0,1.0,yes,android,12.0,poco,512
4,non-expensive,80.0,yes,yes,no,no,no,no,snapdragon,680.0,8.0,2.4,mid-range,8.0,128.0,high-end,5000.0,44,6.58,1080.0,2408.0,90.0,water drop notch,50.0,3.0,16.0,1.0,yes,android,12.0,vivo,1000


# Visualization 

In [None]:
# Visualization code

# Feature Engineering & Selection

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder , OrdinalEncoder
from category_encoders import TargetEncoder

In [None]:

# Yes/no style columns (and the two-class target `price`) are mapped to integer codes.
# NOTE(review): `df` is not created in any earlier visible cell — confirm where it comes from.
binary_columns = [
    'price', 'Dual_Sim', '4G', '5G', 'Vo5G', 'NFC', 'IR_Blaster', 'memory_card_support',
]
le = LabelEncoder()
for col in binary_columns:
    # The single encoder is refitted per column, so afterwards `le` only
    # remembers the classes of the last column in the list.
    df[col] = le.fit(df[col]).transform(df[col])
df.head()

In [None]:

# Pairwise correlation of every numeric column, drawn as an annotated heatmap.
numeric_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (numeric features)')
plt.show()
    

In [None]:
# Numeric columns that get a log1p-transformed copy named "log_<column>".
outliers_col = [
    'rating', 'Processor_Series', 'Core_Count', 'Clock_Speed_GHz',
    'RAM_Size_GB', 'Storage_Size_GB', 'battery_capacity', 'fast_charging_power',
    'Screen_Size', 'Resolution_Width', 'Resolution_Height', 'Refresh_Rate',
    'primary_rear_camera_mp', 'primary_front_camera_mp', 'num_front_cameras',
    'memory_card_size_GB',
]
# Strip accidental surrounding whitespace from the names before using them as keys.
outliers_col_clean = list(map(str.strip, outliers_col))
for feature in outliers_col_clean:
    df["log_" + feature] = np.log1p(df[feature])

In [None]:
# Performance score: product of the log core count, log clock speed and a quarter of the log RAM size.
core_term = df['log_Core_Count']
clock_term = df['log_Clock_Speed_GHz']
ram_term = df['log_RAM_Size_GB'] / 4
df['performance_score'] = core_term * clock_term * ram_term

# Camera score: 70/30 weighted blend of the log rear and log front megapixel features.
rear_part = df['log_primary_rear_camera_mp'] * 0.7
front_part = df['log_primary_front_camera_mp'] * 0.3
df['camera_quality_score'] = rear_part + front_part

In [None]:
# Drop the original (untransformed) columns listed in outliers_col_clean from df.
# The drop happens in place, so df no longer holds these columns afterwards.
# NOTE(review): re-running this cell on the already-reduced frame presumably
# fails because the columns are gone — confirm before relying on reruns.
df.drop(columns=outliers_col_clean, inplace=True)

In [None]:
# List the distinct values of each remaining categorical column, one line per column.
colms = ['Processor_Brand', 'Performance_Tier', 'RAM_Tier', 'Notch_Type', 'os_name', 'brand']
print("\n".join(f"{name}: {df[name].unique()}" for name in colms))
   

In [None]:
# Frequency of each brand value.
df['brand'].value_counts()

In [None]:
# Frequency of each processor brand value.
df['Processor_Brand'].value_counts()

In [None]:
# Frequency of each notch type value.
df['Notch_Type'].value_counts()


In [None]:
# Frequency of each operating-system name.
df['os_name'].value_counts()

In [None]:
# Same distinct-value listing as the earlier inspection cell, repeated just before encoding.
colms = ['Processor_Brand', 'Performance_Tier', 'RAM_Tier', 'Notch_Type', 'os_name', 'brand']
for column_name in colms:
    distinct_values = df[column_name].unique()
    print(f"{column_name}: {distinct_values}")
   

In [None]:
# One-hot encode the nominal columns; drop='first' removes one indicator per
# column so the remaining indicators are not perfectly collinear.
one_hot_cols = ['Processor_Brand', 'Notch_Type', 'os_name']
ohn = OneHotEncoder(drop='first', sparse_output=False)
encoded_cols = ohn.fit_transform(df[one_hot_cols])
encoded_col_names = ohn.get_feature_names_out(one_hot_cols)
encoded_df = pd.DataFrame(data=encoded_cols, columns=encoded_col_names)

# encoded_df carries a fresh 0..n-1 index, so df is re-indexed the same way
# before the column-wise concat; otherwise rows would be aligned by label.
df.reset_index(drop=True, inplace=True)
df = pd.concat([df, encoded_df], axis=1).drop(columns=one_hot_cols)

In [None]:
# Ordered tier labels, lowest to highest. They are lower-case because the
# cleaning step lower-cases every object column; Title-case labels here would
# make OrdinalEncoder reject the data as unknown categories.
tier_levels = ['unknown', 'budget', 'low-end', 'mid-range', 'high-end', 'flagship']
tiers_order = [tier_levels, tier_levels]
tier_cols = ['Performance_Tier', 'RAM_Tier']

# Normalise case/whitespace defensively so the encoder also works if the frame
# did not pass through the cleaning step.
tier_values = df[tier_cols].apply(lambda column: column.str.strip().str.lower())

# NOTE(review): any other label in these columns (e.g. 'Other' introduced by
# replace_to_other) is not in tier_levels and will still raise — confirm the
# set of labels actually present.
oe = OrdinalEncoder(categories=tiers_order)
df[tier_cols] = oe.fit_transform(tier_values)

In [None]:
# Replace each brand by a smoothed statistic of the target `price` for that brand.
# NOTE(review): the encoder is fitted on every row of df, and no train/test split
# appears before this cell, so held-out rows would contribute to their own
# encoding — consider fitting on the training rows only.
te = TargetEncoder(cols=['brand'], min_samples_leaf=20, smoothing=10)
df['brand'] = te.fit_transform(df['brand'], df['price'])

In [None]:
# Preview the frame after all encoding steps.
df.head()

In [None]:
# Correlation heatmap of the fully encoded frame; the canvas is larger because
# the encoding steps added many numeric columns.
numeric_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(35, 30))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (numeric features)')
plt.show()

In [None]:
# Target is the `price` column; the feature matrix is every other column.
y = df['price']
X = df.drop(columns=['price'])


In [None]:
from sklearn.feature_selection import mutual_info_classif

# `price` is a label-encoded class (it is in binary_columns and is modelled with
# classifiers below), so the classification estimator is the appropriate one;
# mutual_info_regression assumes a continuous target.
# The estimator is randomised, so the seed is fixed to keep the ranking
# (and the feature subset derived from it) reproducible.
mi = mutual_info_classif(X, y, random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series.head(20))

fig, ax = plt.subplots(figsize=(10, 8))
mi_series.head(20).plot(kind='barh', ax=ax)
ax.set(title='Top 20 features by mutual information with price',
       xlabel='Mutual information')
plt.show()


# Modeling

### Models -> 2 

In [None]:
!pip install imbalanced-learn
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from imbalanced_learn.over_sampling import SMOT

In [None]:
from sklearn.model_selection import train_test_split

# Keep the N_TOP_FEATURES columns ranked highest by mutual information with the target.
N_TOP_FEATURES = 30
new_df=df[mi_series.head(N_TOP_FEATURES).index]
x=new_df
y=df["price"]

# The scaling and modelling cells expect x_train / x_test / y_train / y_test,
# which no earlier cell creates. Hold out 20% of the rows, stratified on the
# target so both parts keep the same class balance; the seed makes it repeatable.
# (An earlier draft split df by row position instead: rows 0-868 as train and
# the remainder as test. Swap that back in if df is a train+test concatenation.)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Standardise features. The scaler learns mean/variance from the training rows
# only and the same transform is then applied to both splits.
# (A SelectKBest(chi2, k=150) selection step was tried here earlier and is disabled.)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Classifiers used: decision tree, SVM, random forest, XGBoost and KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# One seed for every stochastic estimator, so a rerun of the grid search picks
# the same winners.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used in `parameters` below.
models={
    "decision_tree":DecisionTreeClassifier(random_state=RANDOM_STATE),
    "random_forest":RandomForestClassifier(random_state=RANDOM_STATE),
    # probability=True enables predict_proba (at extra fitting cost).
    "svm":SVC(probability=True, random_state=RANDOM_STATE),
    "xgboost":XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    "knn":KNeighborsClassifier()  # deterministic: no seed needed
}

# Hyper-parameter grid for each model; keys must match `models`.
parameters = {
    "decision_tree": {
        'criterion': ['gini', 'entropy'],        # impurity measure used to choose splits
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],  # features considered per split; fewer = less overfitting
        'class_weight': [None, 'balanced']       # 'balanced' compensates for unequal class sizes
    },

    "random_forest": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2'],        # controls diversity between trees
        'bootstrap': [True, False],              # sample rows with replacement or not
        'class_weight': [None, 'balanced']
    },

    "svm": {
        'C': [0.1, 1, 10],                       # small C = smoother boundary, large C = tighter fit
        'gamma': ['scale', 'auto', 0.1, 0.01],   # kernel coefficient; higher = more local fit
        'kernel': ['rbf', 'poly'],
        'degree': [2, 3],                        # only used when kernel is 'poly'
        'class_weight': [None, 'balanced']
    },

    "xgboost": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0],            # fraction of rows per tree; <1.0 regularises
        'colsample_bytree': [0.7, 0.8, 1.0],     # fraction of columns per tree
        'gamma': [0, 0.1, 0.2],                  # minimum loss reduction required to split
        'scale_pos_weight': [1]                  # raise for imbalanced classes (e.g. n_negative / n_positive)
    },

    "knn": {
        'n_neighbors': [3, 5, 7, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
        'p': [1, 2]                              # 1 = Manhattan, 2 = Euclidean; only affects 'minkowski'
    }
}
def run_grid_search(models, parameters,x_train, y_train,x_test, y_test):
    """Tune every model with 5-fold GridSearchCV and report its hold-out performance.

    For each entry of ``models`` the matching grid in ``parameters`` is searched
    (scored by accuracy, all cores), the best parameters are printed, and the
    refitted best estimator is evaluated on ``x_test`` / ``y_test`` with a
    confusion matrix and a classification report.

    Args:
        models: mapping of model name -> unfitted estimator.
        parameters: mapping of model name -> parameter grid; must contain a key
            for every name in ``models``.
        x_train, y_train: data used for the cross-validated search.
        x_test, y_test: hold-out data used only for the printed reports.

    Returns:
        dict mapping model name -> fitted ``GridSearchCV`` object, so callers
        can read ``best_params_`` / ``best_estimator_`` afterwards.
    """
    results = {}
    for model_name, model in models.items():
        print(f"\nRunning GridSearchCV for {model_name}...")
        grid_search = GridSearchCV(model, parameters[model_name], cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
        grid_search.fit(x_train, y_train)
        results[model_name] = grid_search
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        y_pred=grid_search.predict(x_test)
        print("confusion matrix")
        print(confusion_matrix(y_test,y_pred))
        # Header on its own line so the report's column layout is not shifted.
        print("classification report")
        print(classification_report(y_test,y_pred))
    return results


# Keep the fitted searches so the tuned estimators can be reused in later cells.
grid_results = run_grid_search(models, parameters, x_train, y_train,x_test, y_test)

Decision Tree

In [None]:
# Decision tree with fixed hyper-parameters.
# random_state is fixed because the tree permutes features at each split, so
# ties between equally good splits would otherwise differ from run to run.
DT=DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=20, max_features=None,
                          min_samples_leaf=2, min_samples_split=2, random_state=42)
DT.fit(x_train,y_train)
y_pred=DT.predict(x_test)
print("classification report")
print(classification_report(y_test,y_pred))

# Confusion matrix on the hold-out split (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()
# To persist the fitted model:
# import joblib
# joblib.dump(DT, 'smartphone_price_classifier_model.pkl')

SVM

In [None]:
# Earlier configuration, used with 20 features:
#   SVC(C=1, class_weight='balanced', degree=3, gamma='scale', kernel='poly')
# Current configuration, used with 30 features. `degree` is omitted because it
# only applies to the 'poly' kernel and is ignored by 'rbf'.
SVM=SVC(C=1, class_weight='balanced', gamma='scale', kernel='rbf')

SVM.fit(x_train,y_train)
y_pred=SVM.predict(x_test)
print("classification report")
print(classification_report(y_test,y_pred))

# Confusion matrix on the hold-out split (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()
# To persist the fitted model:
# import joblib
# joblib.dump(SVM, 'smartphone_price_svm_classifier_model.pkl')

### Models -> 2 

In [None]:
# Models code

### Models -> 1

In [None]:
# Model code