In [7]:
pip install numpy pandas matplotlib scikit-learn




In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [9]:
# Read the restaurant data from the Colab working directory.
# NOTE: the file name contains a space before ".csv"; edit the path if your
# copy is named differently.
dataset = pd.read_csv('/content/Dataset .csv')

# Show the first five rows (five is the default for head()).
dataset.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


# Data Inspection and Preprocessing

In [10]:
# Plain-text preview of the first rows, followed by the (rows, columns) shape.
print(dataset.head(), dataset.shape, sep='\n')


   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3      SM 

In [11]:
# Initial feature/target extraction from the raw frame: every column except
# the last forms the feature matrix X, the last column is the target vector y.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


In [12]:
# List the dtype of every column to see which are numeric and which are
# stored as strings ("object").
print(dataset.dtypes)


Restaurant ID             int64
Restaurant Name          object
Country Code              int64
City                     object
Address                  object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking        object
Has Online delivery      object
Is delivering now        object
Switch to order menu     object
Price range               int64
Aggregate rating        float64
Rating color             object
Rating text              object
Votes                     int64
dtype: object


In [13]:
# Partition the column labels by dtype: numeric columns on one side,
# string ("object") columns on the other.
numeric_cols = dataset.select_dtypes(include='number').columns
categorical_cols = dataset.select_dtypes(include='object').columns

# Echo both groups so the partition can be checked by eye.
print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)


Numeric Columns: Index(['Restaurant ID', 'Country Code', 'Longitude', 'Latitude',
       'Average Cost for two', 'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')
Categorical Columns: Index(['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose',
       'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery',
       'Is delivering now', 'Switch to order menu', 'Rating color',
       'Rating text'],
      dtype='object')


In [14]:
# Recompute the numeric / categorical column groups.
# NOTE(review): this repeats the computation of the preceding cell on the same
# frame, so both variables end up with the same contents as before.
numeric_cols = dataset.select_dtypes('number').columns
categorical_cols = dataset.select_dtypes('object').columns

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)


Numeric Columns: Index(['Restaurant ID', 'Country Code', 'Longitude', 'Latitude',
       'Average Cost for two', 'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')
Categorical Columns: Index(['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose',
       'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery',
       'Is delivering now', 'Switch to order menu', 'Rating color',
       'Rating text'],
      dtype='object')


In [15]:
from sklearn.impute import SimpleImputer

# Mean imputation is only defined for numeric data, so the imputer is fitted
# on, and applied to, the numeric columns alone.
# The transformed values are written back as floats, which is why integer
# columns such as "Restaurant ID" show a trailing ".0" afterwards.
imputer = SimpleImputer(strategy='mean').fit(dataset[numeric_cols])
dataset[numeric_cols] = imputer.transform(dataset[numeric_cols])


In [16]:
# Re-extract the target (last column) and the features (all other columns)
# now that the numeric columns have been imputed.
# NOTE(review): these arrays are taken before the categorical imputation and
# the encoding steps in later cells, so they presumably do not reflect those
# changes until X and y are extracted again -- confirm this is intended.
y = dataset.iloc[:, -1].values
X = dataset.iloc[:, :-1].values


In [17]:
# Fill gaps in the string columns with each column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(dataset[categorical_cols])
dataset[categorical_cols] = cat_imputer.transform(dataset[categorical_cols])


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% remain for training);
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the dimensions of each partition.
print("Training Set Shape:", X_train.shape, y_train.shape)
print("Test Set Shape:", X_test.shape, y_test.shape)


Training Set Shape: (7640, 20) (7640,)
Test Set Shape: (1911, 20) (1911,)


In [19]:
# Refresh the list of string-typed ("object") columns that still need to be
# converted to numbers.
categorical_cols = dataset.select_dtypes(include='object').columns

print("Categorical Columns:", categorical_cols)


Categorical Columns: Index(['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose',
       'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery',
       'Is delivering now', 'Switch to order menu', 'Rating color',
       'Rating text'],
      dtype='object')


In [20]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes.
#
# A separate encoder is fitted for each column and kept in `label_encoders`
# (keyed by column name). Re-fitting one shared encoder, as was done before,
# discards the label mapping of every column except the last one, which makes
# it impossible to decode the integers later with
# `label_encoders[col].inverse_transform(...)` or to apply the same mapping to
# new data.
label_encoders = {}

# `labelencoder` is still defined up front so the name exists even when there
# are no categorical columns; after the loop it refers to the encoder of the
# last processed column, as it did previously.
labelencoder = LabelEncoder()

for col in categorical_cols:
    labelencoder = LabelEncoder()  # fresh encoder for this column
    dataset[col] = labelencoder.fit_transform(dataset[col])
    label_encoders[col] = labelencoder

print("Dataset after Label Encoding:\n", dataset.head())


Dataset after Label Encoding:
    Restaurant ID  Restaurant Name  Country Code  City  Address  Locality  \
0      6317637.0             3748         162.0    73     8685       171   
1      6304287.0             3172         162.0    73     6055       593   
2      6300002.0             2896         162.0    75     4684       308   
3      6318506.0             4707         162.0    75     8690       862   
4      6314302.0             5523         162.0    75     8689       862   

   Locality Verbose   Longitude   Latitude  Cuisines  ...  Currency  \
0               172  121.027535  14.565443       920  ...         0   
1               601  121.014101  14.553708      1111  ...         0   
2               314  121.056831  14.581404      1671  ...         0   
3               875  121.056475  14.585318      1126  ...         0   
4               875  121.057508  14.584450      1122  ...         0   

   Has Table booking  Has Online delivery  Is delivering now  \
0                  1 

In [21]:
# Refresh the feature matrix (all columns but the last) and the target vector
# (last column) from the label-encoded frame.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


In [22]:
# Look for string-typed ("object") columns again; once label encoding has
# replaced them with integer codes this selection comes back empty.
categorical_cols = dataset.select_dtypes('object').columns

print("Categorical Columns:", categorical_cols)


Categorical Columns: Index([], dtype='object')


In [23]:
from sklearn.preprocessing import LabelEncoder

# Second label-encoding pass over `categorical_cols`.
# NOTE(review): `categorical_cols` was recomputed in the preceding cell and
# printed as an empty Index, so when the cells run in order this loop body does
# not execute and `dataset` is left unchanged; only `labelencoder` is rebound
# to a new, unfitted encoder.
labelencoder = LabelEncoder()

for col in categorical_cols:
    dataset[col] = labelencoder.fit_transform(dataset[col])

# Show the first rows of the (already encoded) frame.
print("Dataset after Label Encoding:\n", dataset.head())


Dataset after Label Encoding:
    Restaurant ID  Restaurant Name  Country Code  City  Address  Locality  \
0      6317637.0             3748         162.0    73     8685       171   
1      6304287.0             3172         162.0    73     6055       593   
2      6300002.0             2896         162.0    75     4684       308   
3      6318506.0             4707         162.0    75     8690       862   
4      6314302.0             5523         162.0    75     8689       862   

   Locality Verbose   Longitude   Latitude  Cuisines  ...  Currency  \
0               172  121.027535  14.565443       920  ...         0   
1               601  121.014101  14.553708      1111  ...         0   
2               314  121.056831  14.581404      1671  ...         0   
3               875  121.056475  14.585318      1126  ...         0   
4               875  121.057508  14.584450      1122  ...         0   

   Has Table booking  Has Online delivery  Is delivering now  \
0                  1 

In [24]:
# One-hot encode the columns listed in `categorical_cols`, dropping the first
# level of each encoded column (drop_first=True).
# NOTE(review): `categorical_cols` was printed as an empty Index two cells
# earlier, and the preview below matches the label-encoded frame, so no column
# appears to be one-hot encoded here -- confirm whether this step is still needed.
dataset = pd.get_dummies(dataset, columns=categorical_cols, drop_first=True)
print("Dataset after One-Hot Encoding:\n", dataset.head())


Dataset after One-Hot Encoding:
    Restaurant ID  Restaurant Name  Country Code  City  Address  Locality  \
0      6317637.0             3748         162.0    73     8685       171   
1      6304287.0             3172         162.0    73     6055       593   
2      6300002.0             2896         162.0    75     4684       308   
3      6318506.0             4707         162.0    75     8690       862   
4      6314302.0             5523         162.0    75     8689       862   

   Locality Verbose   Longitude   Latitude  Cuisines  ...  Currency  \
0               172  121.027535  14.565443       920  ...         0   
1               601  121.014101  14.553708      1111  ...         0   
2               314  121.056831  14.581404      1671  ...         0   
3               875  121.056475  14.585318      1126  ...         0   
4               875  121.057508  14.584450      1122  ...         0   

   Has Table booking  Has Online delivery  Is delivering now  \
0                  

In [25]:
# Final extraction from the fully encoded frame: the last column is the
# target, everything before it is the feature matrix.
y = dataset.iloc[:, -1].values
X = dataset.iloc[:, :-1].values


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split first (80% train / 20% test, fixed seed) so that the scaling
# statistics are learned from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training features, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Peek at the first five standardized training rows.
print("Scaled Training Data Sample:\n", X_train[:5])


Scaled Training Data Sample:
 [[-1.02605399  0.74339666 -0.3060509   0.41642583 -1.49501356  0.12188215
   0.09151113  0.31088231  0.2530703   0.61917523 -0.07280453  0.0166346
  -0.37046353 -0.58883794 -0.06278679  0.         -0.89163132 -1.76951159
   0.70551895  0.71464152]
 [ 1.06596351 -1.37963952 -0.3060509  -1.51722085  1.46529014  0.78193259
   0.74415593  0.3190199   0.23500192 -0.01191971 -0.05580561  0.0166346
  -0.37046353  1.69826012 -0.06278679  0.         -0.89163132  0.1511305
  -0.63950033 -1.04963936]
 [-1.02641745 -0.4736364  -0.3060509  -1.51722085  0.12750167 -1.34033488
  -1.3160362   0.31884526  0.23385054 -0.00412842 -0.04900605  0.0166346
  -0.37046353 -0.58883794 -0.06278679  0.         -0.89163132  0.34981762
  -0.63950033 -1.04963936]
 [ 1.07022858 -0.01876679 -0.3060509   0.41642583  0.05059049  0.13767283
   0.10654902  0.31480531  0.25497954  0.61917523 -0.0592054   0.0166346
  -0.37046353 -0.58883794 -0.06278679  0.         -0.89163132 -1.76951159
   0.7

In [27]:
# Assemble the scaled training features into a labelled DataFrame.
# X_train is a bare NumPy array without column names, so the names are
# recovered from the encoded dataset: every column except the last one, which
# was split off as the target.
feature_names = list(dataset.columns[:-1])
if len(feature_names) != X_train.shape[1]:
    # Names and array no longer line up (e.g. cells were run out of order):
    # fall back to pandas' default positional column labels.
    feature_names = None
processed_data = pd.DataFrame(X_train, columns=feature_names)
processed_data['Target'] = y_train  # Add the target variable

# Save to CSV (without the row index).
processed_data.to_csv('processed_dataset.csv', index=False)

# Offer the file as a browser download when running in Google Colab.
# Outside Colab the google.colab package does not exist, so the CSV simply
# stays in the working directory instead of the cell failing on the import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'processed_dataset.csv' was saved locally.")
else:
    files.download('processed_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>