# Data preparation for Restaurant AI

This data is provided by kaggle. Zomato 2022 Kolkata data

In [92]:
import pandas as pd
import numpy as np

### Function to extract location:

In [93]:
def extract_parts_of_loc(address, index):
    """
    Extracts parts from location

    Args:
        address (string): Takes in the whole string address
        index (int): The index to determine which part of the address to extract.

    Returns:
        str: The extracted location part of the address.

    Example: 
        address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"
        
        overall_location = extract_location(address,-1)
        -> Kolkata

        overall_location = extract_location(address,-2)
        ->Park Street Area
    """
    # Split the address into parts
    address_parts = address.split(', ')

    #print(address_parts)

    if index==-1:
        loc = "".join(address_parts[index:])
    else:
        # Identify the relevant part (e.g., the last two elements)
        loc = "".join(address_parts[index:index+1])
    
    return loc

In [94]:
def extract_location(address):
    # Split the address into parts
    address_parts = address.split(', ')
    
    # Identify the relevant part (e.g., the last two elements)
    relevant_part = ', '.join(address_parts[-2:])
    
    return relevant_part

# Example usage
address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"
overall_location = extract_location(address)
print(overall_location)


Park Street Area, Kolkata


## Import table:

In [95]:
import pandas as pd

# Specify the path to your CSV file
file_path = 'Zkolkata.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)


### Visualize
Visualize the top 5 entries

In [96]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun)
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun)
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri..."
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)"
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun)


In [97]:
# Example: Get unique values of a specific column
unique_values_column = df['name'].unique()
unique_values_column[5:20]

array(['KFC', 'Tung Fong', 'Arsalan', 'Kusum Rolls', "Kareem's",
       'WOW! China', 'Golden Spoon', 'BarBQ', 'The Kebab Factory',
       'Patisserie By Franziska', 'The Sixth Sense', 'Food King',
       'Burgrill', 'Zainab Biryani', 'The Crepe Cafe'], dtype=object)

In [98]:
# Apply extract location in top 100 rows
address = df['address']
df['Location'] = address[0:100].apply(extract_location)

In [99]:
# Example: Get unique values of a specific column
unique_values_column = df['name'].unique()
unique_values_column[5:20]

array(['KFC', 'Tung Fong', 'Arsalan', 'Kusum Rolls', "Kareem's",
       'WOW! China', 'Golden Spoon', 'BarBQ', 'The Kebab Factory',
       'Patisserie By Franziska', 'The Sixth Sense', 'Food King',
       'Burgrill', 'Zainab Biryani', 'The Crepe Cafe'], dtype=object)

In [100]:
# Convert 'address' column to strings
df['address'] = df['address'].astype(str)

### Adding another column called Location
Adding another column called location which will contain the area not exact address of the restaurent

In [101]:
# Specify the chunk size
chunk_size = 100

# Create an empty 'Location' column
df['Location'] = ''

# Loop through chunks
for i in range(0, len(df['address']), chunk_size):
    chunk_addresses = df['address'].iloc[i:i + chunk_size]
    
    # Apply the extract_location function to the chunk and assign values to the 'Location' column
    df.loc[i:i + chunk_size - 1, 'Location'] = chunk_addresses.apply(extract_location)

In [102]:
# Visualize data
starting_index = 1000
df[starting_index:starting_index+5]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
1000,The Anchorage Bar - The Floatel Hotel,53 votes,3.3,"9/10, Floatel, Kolkata Jetty, Strand Road, Dal...","North Indian, Mughlai, Chinese","₹2,000",1pm – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1001,All Day Dinning,Cuisines:,NEW,"20B, Paddapukur Road, Bhawanipur, Kolkata","North Indian, Mughlai, Chinese",₹900,8:30am – 11pm (Mon-Sun),"Bhawanipur, Kolkata"
1002,Cakerica,12 votes,3.3,"37A, Justice Chandra Madhab Road, Bhawanipur, ...",South Indian,₹100,9am – 9pm (Mon-Sun),"Bhawanipur, Kolkata"
1003,Mughal Garden Restaurant,204 votes,3.8,"10, Kiran Shankar Roy Road, Near Millenium Par...",South Indian,₹400,11am – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1004,The Bengal Ghorana,Cuisines:,-,"1/3A, Rammoy Road, Near Health Point Nursing H...","Rolls, Chinese","₹2,000",24 Hours (Mon-Sun),"Bhowanipore, Kolkata"


## Analyzing rating column and cleaning
Analyzing the rating column and replacing the null entries

In [103]:
# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_ratings = df['rating'].unique()

# Display the unique ratings
print("Unique Ratings:", unique_ratings)

Unique Ratings: ['4.2' '4.9' '4.4' '4.6' '4.1' '4.3' '4.0' '3.8' '3.6' 'NEW' nan '3.3'
 '3.7' '3.9' '3.5' '3.2' '3.1' '3.0' '2.9' '-' '3.4' '2.3' '4.5' '2.8'
 '4.7' '2.7' '2.6' '2.5' '2.1' '4.8' '2.4' '2.2' '1.9' '2.0']


Replace `NEW` and `-` ratings with `0` and `nan` with `-1`

In [104]:
# Assuming df is your DataFrame
# Replace 'NEW' with '0' and 'nan' with '-1' in the entire DataFrame
df.replace({'NEW': '0', np.nan: '-1', '-': '0'}, inplace=True)

# Now, the 'ratings' column should only contain numeric strings, and you can proceed with sorting
# Assuming 'ratings' is the column containing ratings
df['rating'] = df['rating'].astype(float)



In [105]:
# Sort the DataFrame based on the 'ratings' column
#df.sort_values(by='rating', inplace=True)

# Display the DataFrame
df

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun),"Park Street Area, Kolkata"
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun),"Park Street Area, Kolkata"
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...","Park Street Area, Kolkata"
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)","Park Street Area, Kolkata"
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun),"Park Street Area, Kolkata"
...,...,...,...,...,...,...,...,...
7383,WOW! ARBA,Cuisines:,0.0,Delivery Only,"Chinese, Fast Food",₹150,11am – 11pm (Mon-Sun),Delivery Only
7384,Renu's,50 votes,2.8,"89/109, Vivekananda Road, Bangur Park, Hooghly...",Bengali,₹350,11am – 11pm (Mon-Sun),"Rishra, Kolkata"
7385,The Food Place,Cuisines:,0.0,"119, Ramkrishna Road, Radhashree, Landmark- Ai...",Bengali,₹150,8am – 11:45pm (Mon-Sun),"Dum Dum, Kolkata"
7386,New Dalma Restaurant,Cuisines:,0.0,Delivery Only,Chinese,₹250,12noon – 10:30pm (Mon-Sun),Delivery Only


In [106]:
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
5000,Al Zaffran,88 votes,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),"Tara Tala, Kolkata"
5001,The Kitchen,420 votes,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),"Lake Town, Kolkata"
5002,Foodizm,149 votes,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),"Baranagar, Kolkata"
5003,Mehek-E-Dawat,117 votes,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),"Behala, Kolkata"
5004,Hot Spot,38 votes,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),"Dum Dum, Kolkata"


## Analyzing unique locations

In [107]:
# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_loc = df['Location'].unique()

# Display the unique ratings
print("Unique Locations:", unique_loc)

Unique Locations: ['Park Street Area, Kolkata' 'New Market Area, Kolkata'
 'Russel Street Area, Kolkata' 'Ripon Street, Kolkata' 'Delivery Only'
 ' Park Street Area, Kolkata' 'Near Park Circus Area, Kolkata'
 'Mirza Ghalib Street Park Street Area, Kolkata'
 'Park Circus Area, Kolkata' 'Wellesley, Kolkata'
 'Camac Street Area, Kolkata' 'Elgin, Kolkata' 'Entally, Kolkata'
 'Loudon Street Area, Kolkata' 'Theatre Road, Kolkata'
 'Esplanade, Kolkata' 'Minto Park, Kolkata' 'Chowringhee, Kolkata'
 'Taltala, Kolkata' '43B ripon street Kolkata'
 'Beck Bagan Park Circus Area, Kolkata'
 'IDBI Bank Building, New Market Area Kolkata' 'Near Elgin, Kolkata'
 'Camac Street, Kolkata' '18, Hemanta Basu Sarani kolkata' 'nan'
 'Nandalal Basu Sarani, Kolkata' 'Lord Sinha Road Elgin, Kolkata'
 '\n Taltala, Kolkata' 'Circus Avenue Park Circus Area, Kolkata'
 'Near Entally, Kolkata' 'Ballygunge, Kolkata' 'Bhawanipur, Kolkata'
 'Paddapukur, Kolkata' 'Sealdah Area, Kolkata' 'Chandni Chowk, Kolkata'
 'Bara Bazar

### Splitting Location
Column location needs to be split into two parts City and location

I have created a helper function called `extract_part_of_loc` it can be used to extract parts of location

In [108]:
# Update the 'City' column with values
for i in range(len(df)):
    df.loc[i, 'City'] = extract_parts_of_loc(df.loc[i, 'address'], -1)

In [109]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun),"Park Street Area, Kolkata",Kolkata
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun),"Park Street Area, Kolkata",Kolkata
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...","Park Street Area, Kolkata",Kolkata
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)","Park Street Area, Kolkata",Kolkata
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun),"Park Street Area, Kolkata",Kolkata


Removing City names after commas from Location column

In [110]:
# Update the 'Location' column with values
for i in range(len(df)):
    df.loc[i, 'Location'] = extract_parts_of_loc(df.loc[i, 'address'], -2)

In [111]:
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88 votes,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420 votes,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149 votes,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117 votes,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38 votes,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


### VoteCount field
Cleaning the unneccesary stuff present in voteCount column

In [112]:
# Remove 'votes' from the 'voteCount' column
df['voteCount'] = df['voteCount'].str.replace(' votes', '')

# Display the DataFrame
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


In [113]:
# Remove '₹' from the 'cost' column
df['cost'] = df['cost'].str.replace('₹', '')

# Display the DataFrame
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


In [114]:
# Assuming df is your processed DataFrame
df.to_csv('Zomato_processed_data.csv', index=False)