In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [69]:
df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [70]:
#predict whats the time ?if lunch or dinner
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [71]:
#EDA
df.info

<bound method DataFrame.info of      total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]>

In [72]:
#since time is nominal we use label encoder 
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['time']=encoder.fit_transform(df['time'])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [73]:
df.time.unique()

array([0, 1])

In [74]:
X=df.drop('time',axis=1)
y=df.time


In [75]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [76]:
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
154,19.77,2.0,Male,No,Sun,4
167,31.71,4.5,Male,No,Sun,4
110,14.0,3.0,Male,No,Sat,2
225,16.27,2.5,Female,Yes,Fri,2


In [77]:
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [78]:
#handling missing value data encoding feature scaling
from sklearn.impute import SimpleImputer #for missing value

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import  StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [79]:
cat_cols=["sex","smoker","day"]
num_col=["total_bill","tip","size"]


In [80]:
#feature engineering asuromation using pipeline and column transformer 
num_pipeline=Pipeline(steps=[('imputation',SimpleImputer(strategy="median")),('scaling',StandardScaler())])
cat_pipeline=Pipeline(steps=[('imputation',SimpleImputer(strategy="most_frequent")),('encoding',OneHotEncoder())])

In [81]:
preprocessor=ColumnTransformer([("num_pipeline",num_pipeline,num_col),("cat_pipeline",cat_pipeline,cat_cols)])

In [82]:
# 🔄 TRANSFORM TRAINING DATA
# =========================

# Apply preprocessing pipeline to training data
print("🔄 Applying preprocessing to training data...")
X_train_transformed = preprocessor.fit_transform(X_train)

print("✅ Training data transformation complete!")
print(f"Original training shape: {X_train.shape}")
print(f"Transformed training shape: {X_train_transformed.shape}")
print(f"Features expanded from {X_train.shape[1]} to {X_train_transformed.shape[1]} due to one-hot encoding")
print()
print("💡 What happened:")
print("   • Categorical variables (sex, smoker, day) converted to binary features")
print("   • Numerical variables scaled to standard normal distribution")
print("   • Missing values imputed (if any)")
print("   • Data is now ready for machine learning models!")

🔄 Applying preprocessing to training data...
✅ Training data transformation complete!
Original training shape: (195, 6)
Transformed training shape: (195, 11)
Features expanded from 6 to 11 due to one-hot encoding

💡 What happened:
   • Categorical variables (sex, smoker, day) converted to binary features
   • Numerical variables scaled to standard normal distribution
   • Missing values imputed (if any)
   • Data is now ready for machine learning models!


In [83]:
# 🔄 TRANSFORM TEST DATA
# ======================

# Apply preprocessing pipeline to test data (using already fitted preprocessor)
print("🔄 Applying preprocessing to test data...")
X_test_transformed = preprocessor.transform(X_test)

print("✅ Test data transformation complete!")
print(f"Original test shape: {X_test.shape}")
print(f"Transformed test shape: {X_test_transformed.shape}")
print()
print("💡 Important notes:")
print("   • Used transform() not fit_transform() to avoid data leakage")
print("   • Same preprocessing applied as learned from training data")
print("   • Both datasets now have consistent feature structure")
print("   • Ready for model training and evaluation!")

🔄 Applying preprocessing to test data...
✅ Test data transformation complete!
Original test shape: (49, 6)
Transformed test shape: (49, 11)

💡 Important notes:
   • Used transform() not fit_transform() to avoid data leakage
   • Same preprocessing applied as learned from training data
   • Both datasets now have consistent feature structure
   • Ready for model training and evaluation!


In [84]:
X_train

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
154,19.77,2.00,Male,No,Sun,4
167,31.71,4.50,Male,No,Sun,4
110,14.00,3.00,Male,No,Sat,2
225,16.27,2.50,Female,Yes,Fri,2
...,...,...,...,...,...,...
137,14.15,2.00,Female,No,Thur,2
72,26.86,3.14,Female,Yes,Sat,2
140,17.47,3.50,Female,No,Thur,2
235,10.07,1.25,Male,No,Sat,2


In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
models={"support vector classifier":SVC(),"DT classififer":DecisionTreeClassifier()}


In [86]:
from sklearn.metrics import accuracy_score
def model_train_eval(X_train,y_train,X_test,y_test,models):
    evaluation={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        model_score=accuracy_score(y_test,y_pred)
        evaluation[list(models.keys())[i]]=model_score
    return evaluation

In [87]:
# 🚀 TRAIN AND EVALUATE MODELS WITH PREPROCESSED DATA
# ==================================================

# Use the TRANSFORMED data for model training and evaluation
print("🚀 Training models on preprocessed data...")
print("=" * 50)

evaluation_results = model_train_eval(
    X_train_transformed,  # Use transformed training data (NOT X_train)
    y_train, 
    X_test_transformed,   # Use transformed test data (NOT X_test)
    y_test, 
    models
)

print("📊 MODEL PERFORMANCE RESULTS:")
print("=" * 40)
for model_name, accuracy in evaluation_results.items():
    print(f"{model_name:<25}: {accuracy:.4f} ({accuracy*100:.2f}%)")

print(f"\n🎯 Best performing model: {max(evaluation_results, key=evaluation_results.get)}")
print(f"🏆 Best accuracy: {max(evaluation_results.values()):.4f} ({max(evaluation_results.values())*100:.2f}%)")

print(f"\n💡 Success! Models trained on properly preprocessed data:")
print(f"   ✅ No more 'string to float' conversion errors")
print(f"   ✅ Categorical variables properly encoded as numbers")
print(f"   ✅ Numerical features properly scaled")
print(f"   ✅ All features ready for machine learning algorithms")

🚀 Training models on preprocessed data...
📊 MODEL PERFORMANCE RESULTS:
support vector classifier: 0.9184 (91.84%)
DT classififer           : 0.9184 (91.84%)

🎯 Best performing model: support vector classifier
🏆 Best accuracy: 0.9184 (91.84%)

💡 Success! Models trained on properly preprocessed data:
   ✅ No more 'string to float' conversion errors
   ✅ Categorical variables properly encoded as numbers
   ✅ Numerical features properly scaled
   ✅ All features ready for machine learning algorithms
