# For a given set of training examples stored in a .csv file (tips.csv), implement and demonstrate the conversion of categorical data to numeric form using Python libraries.
Dataset: https://www.kaggle.com/datasets/hnazari8665/tipscsv 

# import necessary libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Read the tips dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/categorical-data/tips.csv')

# Preview the first five rows to understand the columns and their values.
print(df.head(n=5))

   total_bill   tip     sex smoker  day    time  size  price_per_person  \
0       16.99  1.01  Female     No  Sun  Dinner     2              8.49   
1       10.34  1.66    Male     No  Sun  Dinner     3              3.45   
2       21.01  3.50    Male     No  Sun  Dinner     3              7.00   
3       23.68  3.31    Male     No  Sun  Dinner     2             11.84   
4       24.59  3.61  Female     No  Sun  Dinner     4              6.15   

           Payer Name         CC Number Payment ID  
0  Christy Cunningham  3560325168603410    Sun2959  
1      Douglas Tucker  4478071379779230    Sun4608  
2      Travis Walters  6011812112971322    Sun4458  
3    Nathaniel Harris  4676137647685994    Sun5260  
4        Tonya Carter  4832732618637221    Sun2251  


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(244, 11)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): 'CC Number' is summarised because it is stored as int64, but it
# looks like an identifier, so its mean/std are presumably not meaningful.
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


In [6]:
# Element-wise missing-value mask: True wherever a cell is null.
df.isnull()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
239,False,False,False,False,False,False,False,False,False,False,False
240,False,False,False,False,False,False,False,False,False,False,False
241,False,False,False,False,False,False,False,False,False,False,False
242,False,False,False,False,False,False,False,False,False,False,False


In [7]:
# Number of missing values in each column.
df.isnull().sum()

total_bill          0
tip                 0
sex                 0
smoker              0
day                 0
time                0
size                0
price_per_person    0
Payer Name          0
CC Number           0
Payment ID          0
dtype: int64

In [8]:
# Column names, non-null counts, dtypes and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 21.1+ KB


In [9]:
# First five rows of the dataset.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [10]:
# Last five rows of the dataset.
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.0,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.0,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672


# Identify Categorical Columns

In [11]:
# Display the column dtypes to find which ones are categorical (object dtype).
print(df.dtypes)

# Alternatively, you can list the categorical columns manually.
# 'Payer Name' and 'Payment ID' are also object dtype but are left out of this list.
categorical_column = ['sex','smoker','day','time'] # categorical columns picked from the tips dataset

total_bill          float64
tip                 float64
sex                  object
smoker               object
day                  object
time                 object
size                  int64
price_per_person    float64
Payer Name           object
CC Number             int64
Payment ID           object
dtype: object


# Convert Categorical Data to Numeric

# Label Encoding for Ordinal Categories

In [12]:
# Apply label encoding.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# fitted category -> integer mapping (`encoder.classes_`) stays available for
# every column. Refitting one shared encoder would keep only the mapping of
# the last column fitted, making the other columns impossible to decode.
label_encoders = {}
for column in categorical_column:
    label_encoder = LabelEncoder()
    # Replaces the text labels in place with integer codes.
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# NOTE: LabelEncoder numbers the classes in sorted order (e.g. Female=0, Male=1),
# so the integers carry no real ranking between categories.

# Display the transformed DataFrame.
print(df.head())

   total_bill   tip  sex  smoker  day  time  size  price_per_person  \
0       16.99  1.01    0       0    2     0     2              8.49   
1       10.34  1.66    1       0    2     0     3              3.45   
2       21.01  3.50    1       0    2     0     3              7.00   
3       23.68  3.31    1       0    2     0     2             11.84   
4       24.59  3.61    0       0    2     0     4              6.15   

           Payer Name         CC Number Payment ID  
0  Christy Cunningham  3560325168603410    Sun2959  
1      Douglas Tucker  4478071379779230    Sun4608  
2      Travis Walters  6011812112971322    Sun4458  
3    Nathaniel Harris  4676137647685994    Sun5260  
4        Tonya Carter  4832732618637221    Sun2251  


# One-Hot Encoding for Nominal Categories

In [13]:
# Apply one-hot encoding; drop_first=True drops the first level of each column
# to avoid a redundant (perfectly collinear) indicator.
# NOTE(review): at this point df's sex/smoker/day/time columns already hold
# LabelEncoder integer codes, so the indicator columns are named after the codes
# (sex_1, day_2, ...) rather than the original labels. Run get_dummies on the
# unencoded data if readable names such as sex_Male are wanted.
df_encoded = pd.get_dummies(
    df,
    columns=['sex', 'smoker', 'day', 'time'],
    drop_first=True,
)

# Display the transformed DataFrame.
print(df_encoded.head())

   total_bill   tip  size  price_per_person          Payer Name  \
0       16.99  1.01     2              8.49  Christy Cunningham   
1       10.34  1.66     3              3.45      Douglas Tucker   
2       21.01  3.50     3              7.00      Travis Walters   
3       23.68  3.31     2             11.84    Nathaniel Harris   
4       24.59  3.61     4              6.15        Tonya Carter   

          CC Number Payment ID  sex_1  smoker_1  day_1  day_2  day_3  time_1  
0  3560325168603410    Sun2959  False     False  False   True  False   False  
1  4478071379779230    Sun4608   True     False  False   True  False   False  
2  6011812112971322    Sun4458   True     False  False   True  False   False  
3  4676137647685994    Sun5260   True     False  False   True  False   False  
4  4832732618637221    Sun2251  False     False  False   True  False   False  
