In [1]:
# Required libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Load the dataset
# `data` is the single frame every later cell reads from; the encoding cell
# further down overwrites its sex/smoker/day/time columns in place.
data = sns.load_dataset('tips')

In [2]:
# Class balance of the smoker column while it still holds the raw 'No'/'Yes' labels.
data['smoker'].value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

In [3]:
# Frequency of each value in the `size` column.
# Bracket indexing is required here: attribute access (`data.size`) would return
# the DataFrame's element count rather than this column.
data['size'].value_counts()

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64

In [4]:
# Frequency of each distinct total_bill value.
# NOTE(review): the output reports 229 distinct values, so almost every count
# is 1 or 2; `data['total_bill'].describe()` may be a more useful summary.
data['total_bill'].value_counts()

13.42    3
13.81    2
15.98    2
17.92    2
10.07    2
        ..
24.71    1
21.16    1
28.97    1
22.49    1
18.78    1
Name: total_bill, Length: 229, dtype: int64

In [5]:
# Encode the categorical columns as integers, split the data, fit a random
# forest on it, report the test-set MAE and pickle the fitted model.

# One seed shared by the split and the model so the reported MAE and the saved
# model are reproducible across kernel restarts.
RANDOM_STATE = 42

# Mapping categorical variables manually
# NOTE: the prediction cell further down repeats these dictionaries; the two
# copies must stay identical or predictions will be silently wrong.
sex_mapping = {'Male': 0, 'Female': 1}
smoker_mapping = {'No': 0, 'Yes': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
time_mapping = {'Lunch': 0, 'Dinner': 1}

categorical_mappings = {
    'sex': sex_mapping,
    'smoker': smoker_mapping,
    'day': day_mapping,
    'time': time_mapping,
}

# Encode into temporaries first: `.map` turns any value missing from the
# dictionary into NaN, which is exactly what happens when this cell is run a
# second time on already-encoded columns. Validate before touching `data` so a
# failed run leaves the frame unchanged.
encoded_columns = {
    column: data[column].map(mapping)
    for column, mapping in categorical_mappings.items()
}
unmapped_columns = [
    column for column, values in encoded_columns.items() if values.isna().any()
]
if unmapped_columns:
    raise ValueError(
        f"Unmapped values in column(s) {unmapped_columns}; "
        "reload the dataset before re-running this cell."
    )

# `data` is updated in place, so later cells see the integer codes.
for column, values in encoded_columns.items():
    data[column] = values

# Features and target variable
x = data.drop(columns=['tip'])
y = data['tip']

# Split into training and testing data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train the model (seeded: an unseeded forest gives a different MAE every run)
model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Model saving
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

Mean Absolute Error: 0.7454775510204084


In [6]:
# Feature columns, in the order the model was fitted on; frames passed to
# predict() later should use these same columns.
x_train.columns

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [7]:
# Inspect the training target: the `tip` values for the rows in the training split.
y_train

228    2.72
208    2.03
96     4.00
167    4.50
84     2.03
       ... 
106    4.06
14     3.02
92     1.00
179    3.55
102    2.50
Name: tip, Length: 195, dtype: float64

In [8]:
# Re-check the smoker column after encoding: the labels are now 0/1 and the
# counts should match the 'No'/'Yes' counts seen before the mapping.
data['smoker'].value_counts()

0    151
1     93
Name: smoker, dtype: int64

In [9]:
import pandas as pd
# One hand-written observation to score, expressed with the raw string labels;
# a later cell converts the labels to integer codes before predicting.
sample_record = {
    'total_bill': 200,
    'sex': 'Male',
    'smoker': 'No',
    'day': 'Thur',
    'time': 'Lunch',
    'size': 2,
}
# A one-element list of records yields a single-row frame whose column order
# follows the dictionary's key order.
input_data = pd.DataFrame([sample_record])

In [10]:
# Encode `input_data` with the same integer codes used for training, load the
# pickled model and print its prediction.

# These dictionaries duplicate the ones in the training cell; keep the two
# copies identical, otherwise the model receives codes it was not trained on.
sex_mapping = {'Male': 0, 'Female': 1}
smoker_mapping = {'No': 0, 'Yes': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
time_mapping = {'Lunch': 0, 'Dinner': 1}

categorical_mappings = {
    'sex': sex_mapping,
    'smoker': smoker_mapping,
    'day': day_mapping,
    'time': time_mapping,
}

# `.map` silently turns any label missing from the dictionary into NaN -- this
# happens for a typo such as 'male', and also when this cell is re-run on an
# `input_data` that has already been encoded. Encode into temporaries and
# validate first so bad input raises a clear error instead of reaching the
# model, and so a failed run leaves `input_data` untouched.
encoded_columns = {
    column: input_data[column].map(mapping)
    for column, mapping in categorical_mappings.items()
}
unmapped_columns = [
    column for column, values in encoded_columns.items() if values.isna().any()
]
if unmapped_columns:
    raise ValueError(
        f"input_data has values with no encoding in column(s) {unmapped_columns}; "
        "rebuild input_data from the raw labels before re-running this cell."
    )

# `input_data` is updated in place, so later cells see the integer codes.
for column, values in encoded_columns.items():
    input_data[column] = values

# SECURITY: unpickling executes arbitrary code from the file. Only load a
# model.pkl that this notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    m = pickle.load(model_file)

predict = m.predict(input_data)
print(predict)

[8.6755]


In [11]:
# Show the frame after the previous cell replaced its categorical labels with
# integer codes in place.
print(input_data)

   total_bill  sex  smoker  day  time  size
0         200    0       0    0     0     2
