# Data preparation for Restaurant AI

This data is provided by Kaggle: the Zomato 2022 Kolkata dataset.

In [83]:
import pandas as pd
import numpy as np

### Function to extract location:

In [97]:
def extract_parts_of_loc(address, index):
    """
    Extract a single comma-separated part from an address.

    The address is split on ", " and the part at position ``index`` is
    returned. Negative indices count from the end, as with list indexing.

    Args:
        address (str): The whole address string.
        index (int): Position of the part to extract.

    Returns:
        str: The requested part of the address, or an empty string when
            ``index`` is out of range.

    Example:
        address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"

        extract_parts_of_loc(address, -1)
        -> "Kolkata"

        extract_parts_of_loc(address, -2)
        -> "Park Street Area"
    """
    # Split the address into parts
    address_parts = address.split(', ')

    try:
        return address_parts[index]
    except IndexError:
        # Out-of-range positions yield an empty string instead of raising
        return ""

In [1]:
def extract_location(address):
    """
    Return the last two comma-separated parts of an address.

    Args:
        address (str): The whole address string, with parts separated by ", ".

    Returns:
        str: The final two parts joined by ", ". If the address has fewer
            than two parts, it is returned unchanged.
    """
    # Splitting from the right at most twice isolates the trailing two parts;
    # anything before them stays lumped in the first element and is dropped.
    trailing = address.rsplit(', ', 2)[-2:]
    return ', '.join(trailing)

# Example usage: reduce a sample address to its last two comma-separated parts
address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"
overall_location = extract_location(address)
print(overall_location)


Park Street Area, Kolkata


## Import table:

In [21]:
import pandas as pd

# Path to the Zomato Kolkata CSV (relative, so it is resolved against the working directory)
file_path = 'Zkolkata.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)


### Visualize
Visualize the top 5 entries

In [22]:
# Preview the first rows (DataFrame.head returns 5 rows by default)
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun)
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun)
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri..."
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)"
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun)


In [23]:
# Collect the distinct restaurant names and look at entries 5 through 19
unique_values_column = df['name'].unique()
unique_values_column[5:20]

array(['KFC', 'Tung Fong', 'Arsalan', 'Kusum Rolls', "Kareem's",
       'WOW! China', 'Golden Spoon', 'BarBQ', 'The Kebab Factory',
       'Patisserie By Franziska', 'The Sixth Sense', 'Food King',
       'Burgrill', 'Zainab Biryani', 'The Crepe Cafe'], dtype=object)

In [65]:
# Trial run: apply extract_location to the first 100 addresses only.
# NOTE(review): assigning the 100-row result to df['Location'] presumably leaves
# every other row as NaN (index alignment); a later cell rebuilds the whole column.
# NOTE(review): this rebinds the global `address`, which previously held the example string.
address = df['address']
df['Location'] = address[0:100].apply(extract_location)

In [None]:
# Distinct restaurant names, entries 5 through 19.
# NOTE(review): this cell repeats an earlier cell verbatim and has no execution count;
# it looks like a leftover and is a candidate for removal.
unique_values_column = df['name'].unique()
unique_values_column[5:20]

In [63]:
# Cast the 'address' column to str so that string methods such as .split work on every row.
# NOTE(review): missing addresses presumably become the literal string 'nan' — confirm this is acceptable downstream.
df['address'] = df['address'].astype(str)

### Adding another column called Location
Add a new column called `Location` that contains the restaurant's area rather than its exact address.

In [66]:
# Derive the area-level 'Location' (the last two address parts) for every row.
# One Series.apply covers the whole column and aligns on the index, so the result is
# correct for any index; slicing rows by position while assigning by label is only
# safe on a default RangeIndex.
# astype(str) keeps the cell safe even if 'address' still contains non-string values.
df['Location'] = df['address'].astype(str).apply(extract_location)

In [76]:
# Inspect five consecutive rows, selected by position
starting_index = 1000
df.iloc[starting_index:starting_index + 5]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
1000,The Anchorage Bar - The Floatel Hotel,53 votes,3.3,"9/10, Floatel, Kolkata Jetty, Strand Road, Dal...","North Indian, Mughlai, Chinese","₹2,000",1pm – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1001,All Day Dinning,Cuisines:,NEW,"20B, Paddapukur Road, Bhawanipur, Kolkata","North Indian, Mughlai, Chinese",₹900,8:30am – 11pm (Mon-Sun),"Bhawanipur, Kolkata"
1002,Cakerica,12 votes,3.3,"37A, Justice Chandra Madhab Road, Bhawanipur, ...",South Indian,₹100,9am – 9pm (Mon-Sun),"Bhawanipur, Kolkata"
1003,Mughal Garden Restaurant,204 votes,3.8,"10, Kiran Shankar Roy Road, Near Millenium Par...",South Indian,₹400,11am – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1004,The Bengal Ghorana,Cuisines:,-,"1/3A, Rammoy Road, Near Health Point Nursing H...","Rolls, Chinese","₹2,000",24 Hours (Mon-Sun),"Bhowanipore, Kolkata"


## Analyzing rating column and cleaning
Analyzing the rating column and replacing the null entries

In [78]:
# List the distinct values of the 'rating' column to spot non-numeric placeholders
unique_ratings = df['rating'].unique()

# Display the unique ratings
print("Unique Ratings:", unique_ratings)

Unique Ratings: ['4.2' '4.9' '4.4' '4.6' '4.1' '4.3' '4.0' '3.8' '3.6' 'NEW' nan '3.3'
 '3.7' '3.9' '3.5' '3.2' '3.1' '3.0' '2.9' '-' '3.4' '2.3' '4.5' '2.8'
 '4.7' '2.7' '2.6' '2.5' '2.1' '4.8' '2.4' '2.2' '1.9' '2.0']


Replace `NEW` and `-` ratings with `0` and `nan` with `-1`

In [86]:
# Replace the placeholders 'NEW' and '-' with '0', and missing values with '-1'.
# NOTE: the replacement runs over the entire DataFrame, not only 'rating', so exact
# matches in other columns (e.g. a missing 'timing') are rewritten as well.
df.replace({'NEW': '0', np.nan: '-1', '-': '0'}, inplace=True)

# Build the numeric 'ratings' column from the cleaned 'rating' strings.
# The source column must be 'rating': the loaded CSV has no 'ratings' column, so reading
# df['ratings'] here would raise KeyError on a fresh kernel.
df['ratings'] = df['rating'].astype(float)



In [88]:
# Order the rows by the numeric 'ratings' column (ascending, so lowest first)
df = df.sort_values(by='ratings')

# Show the sorted frame
df

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,ratings
3957,Club Fenicia,Cuisines:,-1,"Godrej Waterside, Tower- I, 10th Floor, DP Blo...","North Indian, Beverages","₹2,000",-1,"Salt Lake, Kolkata",-1.0
2238,Sizzler,Cuisines:,-1,"Pouro Prashan Vaban, Sector 3, Salt Lake, Kolkata","Bengali, Bangladeshi, Seafood",₹300,-1,"Salt Lake, Kolkata",-1.0
4425,Hunger's Cafe,Cuisines:,-1,"237/B, S.D Chatterjee Road, Baruipur, Narendra...","Desserts, Beverages, North Indian",₹300,-1,"Narendra Pur, Kolkata",-1.0
3522,Haji Food Centre,Cuisines:,-1,"24, Sahapur Main Road, Alipore, Kolkata","Pizza, Fast Food",₹200,-1,"Alipore, Kolkata",-1.0
1061,Chinese Flavours,Cuisines:,-1,"14/H/6, Bibi Bagan Lane, Entally, Kolkata",Fast Food,₹350,-1,"Entally, Kolkata",-1.0
...,...,...,...,...,...,...,...,...,...
3973,Blu Orchid,718 votes,4.8,"C22, Sammilani Park, Survey Park Area, Ajoy Na...",Bakery,₹700,12noon – 11pm (Mon-Sun),"Ajoy Nagar, Kolkata",4.8
3958,Urban Masala,494 votes,4.8,"882, Lake Town Road, Near Jaya Cinema, Lake To...",Mishti,₹600,"8am – 10:30pm (Mon-Fri),8am – 11:30pm (Sat-Sun)","Lake Town, Kolkata",4.8
1109,Kopai,702 votes,4.8,"213, Sarat Bose Road, Near Post Office, Southe...",Tea,₹700,11:30am – 10pm (Mon-Sun),"Southern Avenue, Kolkata",4.8
1221,Girish Chandra Dey & Nakur Chandra Nandy,1586 votes,4.8,"56, Ramdulal Sarkar Street, Hedua Park, Hatiba...","Bakery, Desserts",₹100,7am – 10:30pm (Mon-Sun),"Hatibagan, Kolkata",4.8


In [98]:
# Rows 5000-5004 by position in the current row order (not by index label)
df.iloc[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,ratings
5973,Radha Gobinda Mistanna,36 votes,3.4,"Shop 20, 2 Bazar, Diagonal Road, B7, Block B, ...",Mishti,₹100,7am – 12midnight (Mon-Sun),"Kalyani, Kolkata",3.4
831,Haldiram & Sons Bhujiawala,44 votes,3.4,"62, Bentinck Street, Bow Bazar, Kolkata","North Indian, Chinese, Continental, Italian",₹350,8:30am – 9:30pm (Mon-Sun),"Bow Bazar, Kolkata",3.4
6256,KG4,115 votes,3.4,"422, Sudha Apartments, Garia Garden, Garia, Ko...",Chinese,₹450,12noon – 10:30pm (Mon-Sun),"Garia, Kolkata",3.4
3391,Anaya Kitchen,32 votes,3.4,"3/1, Gopal Mukherjee Road, Paikpara, Kolkata","North Indian, Chinese, Rolls, Fast Food",₹300,12noon – 12midnight (Mon-Sun),"Paikpara, Kolkata",3.4
1535,Sonar Bangla,45 votes,3.4,"7, Andul Raj Road, Hazra, Kolkata","Fast Food, Burger, American, Rolls, Wraps",₹250,12noon – 10pm (Mon-Sun),"Hazra, Kolkata",3.4


## Analyzing unique locations

In [95]:
# Collect the distinct values of the 'Location' column
unique_loc = df['Location'].unique()

# Display the unique locations
print("Unique Locations:", unique_loc)

Unique Locations: ['Salt Lake, Kolkata' 'Narendra Pur, Kolkata' 'Alipore, Kolkata'
 'Entally, Kolkata' 'Shyam Bazar, Kolkata' 'Topsia, Kolkata'
 'GT Road, Howrah' 'New Town, Kolkata' 'Tollygunge, Kolkata'
 'Baguihati, Kolkata' 'Kestopur, Kolkata' 'Tara Tala, Kolkata'
 'Beliaghata, Kolkata' 'Jodhpur Park, Kolkata' 'Paikpara, Kolkata'
 'Gariahat, Kolkata' 'Ruby Hospital Area, Kolkata' 'Kalyani, Kolkata'
 'Kona Exp Way, Howrah' 'Sonarpur, Kolkata' 'College Street, Kolkata'
 'Tangra, Kolkata' 'Bhawanipur, Kolkata' 'Near Tollygunge, Kolkata'
 'Kalikapur, Kolkata' 'Ballygunge, Kolkata' 'Dum Dum, Kolkata'
 'Kaikhali, Kolkata' 'Kasba, Kolkata' 'Baranagar, Kolkata'
 'Sinthi, Kolkata' 'Chandni Chowk, Kolkata' 'Delivery Only'
 'Dobson Road, Howrah' 'Southern Avenue, Kolkata' 'Minto Park, Kolkata'
 'Barrackpore, Kolkata' 'Behala, Kolkata' 'Maniktala, Kolkata'
 'Elgin, Kolkata' 'Park Circus Area, Kolkata' 'Near Thakur Pukur, Kolkata'
 'Picnic Garden, Kolkata' 'Garia, Kolkata' 'Lake Town, Kolkata'
 

### Splitting Location
The `Location` column needs to be split into two parts: city and location.

I have created a helper function called `extract_parts_of_loc`; it can be used to extract individual parts of a location.