# Data preparation for Restaurant AI

This data is provided by kaggle. Zomato 2022 Kolkata data

In [1]:
import pandas as pd
import numpy as np

### Function to extract location:

In [1]:
def extract_parts_of_loc(address, index):
    """
    Extracts parts from location

    Args:
        address (string): Takes in the whole string address
        index (int): The index to determine which part of the address to extract.

    Returns:
        str: The extracted location part of the address.

    Example: 
        address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"
        
        overall_location = extract_location(address,-1)
        -> Kolkata

        overall_location = extract_location(address,-2)
        ->Park Street Area
    """
    # Split the address into parts
    address_parts = address.split(', ')

    #print(address_parts)

    if index==-1:
        loc = "".join(address_parts[index:])
    else:
        # Identify the relevant part (e.g., the last two elements)
        loc = "".join(address_parts[index:index+1])
    
    return loc

In [2]:
def extract_location(address):
    # Split the address into parts
    address_parts = address.split(', ')
    
    # Identify the relevant part (e.g., the last two elements)
    relevant_part = ', '.join(address_parts[-2:])
    
    return relevant_part

# Example usage
address = "1, Middleton Row, Near Loreto House, Park Street Area, Kolkata"
overall_location = extract_location(address)
print(overall_location)


Park Street Area, Kolkata


## Import table:

In [4]:
import pandas as pd

# Specify the path to your CSV file
file_path = 'Zkolkata.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)


### Visualize
Visualize the top 5 entries

In [5]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun)
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun)
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri..."
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)"
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun)


In [6]:
# Example: Get unique values of a specific column
unique_values_column = df['name'].unique()
unique_values_column[5:20]

array(['KFC', 'Tung Fong', 'Arsalan', 'Kusum Rolls', "Kareem's",
       'WOW! China', 'Golden Spoon', 'BarBQ', 'The Kebab Factory',
       'Patisserie By Franziska', 'The Sixth Sense', 'Food King',
       'Burgrill', 'Zainab Biryani', 'The Crepe Cafe'], dtype=object)

In [7]:
# Apply extract location in top 100 rows
address = df['address']
df['Location'] = address[0:100].apply(extract_location)

In [8]:
# Example: Get unique values of a specific column
unique_values_column = df['name'].unique()
unique_values_column[5:20]

array(['KFC', 'Tung Fong', 'Arsalan', 'Kusum Rolls', "Kareem's",
       'WOW! China', 'Golden Spoon', 'BarBQ', 'The Kebab Factory',
       'Patisserie By Franziska', 'The Sixth Sense', 'Food King',
       'Burgrill', 'Zainab Biryani', 'The Crepe Cafe'], dtype=object)

In [9]:
# Convert 'address' column to strings
df['address'] = df['address'].astype(str)

### Adding another column called Location
Adding another column called location which will contain the area not exact address of the restaurent

In [10]:
# Specify the chunk size
chunk_size = 100

# Create an empty 'Location' column
df['Location'] = ''

# Loop through chunks
for i in range(0, len(df['address']), chunk_size):
    chunk_addresses = df['address'].iloc[i:i + chunk_size]
    
    # Apply the extract_location function to the chunk and assign values to the 'Location' column
    df.loc[i:i + chunk_size - 1, 'Location'] = chunk_addresses.apply(extract_location)

In [11]:
# Visualize data
starting_index = 1000
df[starting_index:starting_index+5]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
1000,The Anchorage Bar - The Floatel Hotel,53 votes,3.3,"9/10, Floatel, Kolkata Jetty, Strand Road, Dal...","North Indian, Mughlai, Chinese","₹2,000",1pm – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1001,All Day Dinning,Cuisines:,NEW,"20B, Paddapukur Road, Bhawanipur, Kolkata","North Indian, Mughlai, Chinese",₹900,8:30am – 11pm (Mon-Sun),"Bhawanipur, Kolkata"
1002,Cakerica,12 votes,3.3,"37A, Justice Chandra Madhab Road, Bhawanipur, ...",South Indian,₹100,9am – 9pm (Mon-Sun),"Bhawanipur, Kolkata"
1003,Mughal Garden Restaurant,204 votes,3.8,"10, Kiran Shankar Roy Road, Near Millenium Par...",South Indian,₹400,11am – 11pm (Mon-Sun),"Dalhousie BBD Bagh, Kolkata"
1004,The Bengal Ghorana,Cuisines:,-,"1/3A, Rammoy Road, Near Health Point Nursing H...","Rolls, Chinese","₹2,000",24 Hours (Mon-Sun),"Bhowanipore, Kolkata"


## Analyzing rating column and cleaning
Analyzing the rating column and replacing the null entries

In [12]:
# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_ratings = df['rating'].unique()

# Display the unique ratings
print("Unique Ratings:", unique_ratings)

Unique Ratings: ['4.2' '4.9' '4.4' '4.6' '4.1' '4.3' '4.0' '3.8' '3.6' 'NEW' nan '3.3'
 '3.7' '3.9' '3.5' '3.2' '3.1' '3.0' '2.9' '-' '3.4' '2.3' '4.5' '2.8'
 '4.7' '2.7' '2.6' '2.5' '2.1' '4.8' '2.4' '2.2' '1.9' '2.0']


Replace `NEW` and `-` ratings with `0` and `nan` with `-1`

In [13]:
# Assuming df is your DataFrame
# Replace 'NEW' with '0' and 'nan' with '-1' in the entire DataFrame
df.replace({'NEW': '0', np.nan: '-1', '-': '0'}, inplace=True)

# Now, the 'ratings' column should only contain numeric strings, and you can proceed with sorting
# Assuming 'ratings' is the column containing ratings
df['rating'] = df['rating'].astype(float)



In [14]:
# Sort the DataFrame based on the 'ratings' column
#df.sort_values(by='rating', inplace=True)

# Display the DataFrame
df

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun),"Park Street Area, Kolkata"
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun),"Park Street Area, Kolkata"
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...","Park Street Area, Kolkata"
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)","Park Street Area, Kolkata"
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun),"Park Street Area, Kolkata"
...,...,...,...,...,...,...,...,...
7383,WOW! ARBA,Cuisines:,0.0,Delivery Only,"Chinese, Fast Food",₹150,11am – 11pm (Mon-Sun),Delivery Only
7384,Renu's,50 votes,2.8,"89/109, Vivekananda Road, Bangur Park, Hooghly...",Bengali,₹350,11am – 11pm (Mon-Sun),"Rishra, Kolkata"
7385,The Food Place,Cuisines:,0.0,"119, Ramkrishna Road, Radhashree, Landmark- Ai...",Bengali,₹150,8am – 11:45pm (Mon-Sun),"Dum Dum, Kolkata"
7386,New Dalma Restaurant,Cuisines:,0.0,Delivery Only,Chinese,₹250,12noon – 10:30pm (Mon-Sun),Delivery Only


In [15]:
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location
5000,Al Zaffran,88 votes,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),"Tara Tala, Kolkata"
5001,The Kitchen,420 votes,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),"Lake Town, Kolkata"
5002,Foodizm,149 votes,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),"Baranagar, Kolkata"
5003,Mehek-E-Dawat,117 votes,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),"Behala, Kolkata"
5004,Hot Spot,38 votes,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),"Dum Dum, Kolkata"


## Analyzing unique locations

In [16]:
# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_loc = df['Location'].unique()

# Display the unique ratings
print("Unique Locations:", unique_loc)

Unique Locations: ['Park Street Area, Kolkata' 'New Market Area, Kolkata'
 'Russel Street Area, Kolkata' 'Ripon Street, Kolkata' 'Delivery Only'
 ' Park Street Area, Kolkata' 'Near Park Circus Area, Kolkata'
 'Mirza Ghalib Street Park Street Area, Kolkata'
 'Park Circus Area, Kolkata' 'Wellesley, Kolkata'
 'Camac Street Area, Kolkata' 'Elgin, Kolkata' 'Entally, Kolkata'
 'Loudon Street Area, Kolkata' 'Theatre Road, Kolkata'
 'Esplanade, Kolkata' 'Minto Park, Kolkata' 'Chowringhee, Kolkata'
 'Taltala, Kolkata' '43B ripon street Kolkata'
 'Beck Bagan Park Circus Area, Kolkata'
 'IDBI Bank Building, New Market Area Kolkata' 'Near Elgin, Kolkata'
 'Camac Street, Kolkata' '18, Hemanta Basu Sarani kolkata' 'nan'
 'Nandalal Basu Sarani, Kolkata' 'Lord Sinha Road Elgin, Kolkata'
 '\n Taltala, Kolkata' 'Circus Avenue Park Circus Area, Kolkata'
 'Near Entally, Kolkata' 'Ballygunge, Kolkata' 'Bhawanipur, Kolkata'
 'Paddapukur, Kolkata' 'Sealdah Area, Kolkata' 'Chandni Chowk, Kolkata'
 'Bara Bazar

### Splitting Location
Column location needs to be split into two parts City and location

I have created a helper function called `extract_part_of_loc` it can be used to extract parts of location

In [17]:
# Update the 'City' column with values
for i in range(len(df)):
    df.loc[i, 'City'] = extract_parts_of_loc(df.loc[i, 'address'], -1)

In [18]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
0,Peter Cat,12404 votes,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental","₹1,000",11am – 11:20pm (Mon-Sun),"Park Street Area, Kolkata",Kolkata
1,Naturals Ice Cream,2498 votes,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",₹200,11am – 12midnight (Mon-Sun),"Park Street Area, Kolkata",Kolkata
2,Carpe Diem,4083 votes,4.4,"18M, Park Street Area, Kolkata",Ice Cream,"₹1,000","12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...","Park Street Area, Kolkata",Kolkata
3,Barbeque Nation,5455 votes,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,"₹1,800","12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)","Park Street Area, Kolkata",Kolkata
4,Flurys,4709 votes,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",₹400,7:30am – 11pm (Mon-Sun),"Park Street Area, Kolkata",Kolkata


Removing City names after commas from Location column

In [19]:
# Update the 'Location' column with values
for i in range(len(df)):
    df.loc[i, 'Location'] = extract_parts_of_loc(df.loc[i, 'address'], -2)

In [20]:
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88 votes,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420 votes,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149 votes,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117 votes,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38 votes,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


### VoteCount field
Cleaning the unneccesary stuff present in voteCount column

In [21]:
# Remove 'votes' from the 'voteCount' column
df['voteCount'] = df['voteCount'].str.replace(' votes', '')

# Display the DataFrame
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",₹500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",₹900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",₹500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",₹400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,₹300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


In [22]:
# Remove '₹' from the 'cost' column
df['cost'] = df['cost'].str.replace('₹', '')

# Display the DataFrame
df[5000:5005]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
5000,Al Zaffran,88,3.3,"D/1/5, Santoshpur Station Road, Tara Tala, Kol...","Chinese, Mughlai",500,11:30am – 10:30pm (Mon-Sun),Tara Tala,Kolkata
5001,The Kitchen,420,4.0,"P-188, Block-B, Lake Town, Kolkata","Chinese, Mughlai",900,12noon – 11pm (Mon-Sun),Lake Town,Kolkata
5002,Foodizm,149,3.5,"262, B.T Road, Tobin Road Crossing, Baranagar,...","South Indian, Street Food",500,12noon – 10pm (Mon-Sun),Baranagar,Kolkata
5003,Mehek-E-Dawat,117,3.1,"59B, Becharam Chatterjee Road, State Bank Colo...","South Indian, Street Food",400,12noon – 10:30pm (Mon-Sun),Behala,Kolkata
5004,Hot Spot,38,3.3,"Dum Dum Park Bazar, Near Mother Dairy, Dum Dum...",Chinese,300,12:30pm – 10pm (Mon-Sun),Dum Dum,Kolkata


In [23]:
# Assuming df is your processed DataFrame
#df.to_csv('Zomato_processed_data.csv', index=False)

In [24]:
# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_loc = df['Location'].unique()
len(unique_loc)

# Assuming df is your DataFrame and 'ratings' is the column containing ratings
unique_add = df['address'].unique()
len(unique_loc), len(unique_add)

(294, 6927)

In [25]:
df

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata
...,...,...,...,...,...,...,...,...,...
7383,WOW! ARBA,Cuisines:,0.0,Delivery Only,"Chinese, Fast Food",150,11am – 11pm (Mon-Sun),,Delivery Only
7384,Renu's,50,2.8,"89/109, Vivekananda Road, Bangur Park, Hooghly...",Bengali,350,11am – 11pm (Mon-Sun),Rishra,Kolkata
7385,The Food Place,Cuisines:,0.0,"119, Ramkrishna Road, Radhashree, Landmark- Ai...",Bengali,150,8am – 11:45pm (Mon-Sun),Dum Dum,Kolkata
7386,New Dalma Restaurant,Cuisines:,0.0,Delivery Only,Chinese,250,12noon – 10:30pm (Mon-Sun),,Delivery Only


In [79]:
# Specify the path to your CSV file
file_path = 'Zomato_processed_data.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [80]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata


In [26]:
# import re

# def extract_parts_of_timing(timing, part):
#     """
#     Extracts parts from timing

#     Args:
#         timing (string): Takes in the whole timing string
#         part (str): 'time' or 'days', specifies which part to extract.

#     Returns:
#         str: The extracted time or days part.

#     Example:
#         timing = "11am – 11:20pm (Mon-Sun)"
        
#         time_part = extract_parts_of_timing(timing, 'time')
#         -> "11am – 11:20pm"

#         days_part = extract_parts_of_timing(timing, 'days')
#         -> "Mon-Sun"
#     """
#     # Extracting time
#     if part == 'time':
#         time_match = re.search(r'\d+:\d+\w{0,2}\s*–\s*\d+:\d+\w{0,2}', timing)
#         return time_match.group() if time_match else None
#     # Extracting days
#     elif part == 'days':
#         days_match = re.search(r'\((.*?)\)', timing)
#         return days_match.group(1) if days_match else None
#     else:
#         return None




In [27]:
# # Applying the function to create new columns
# df['time'] = df['timing'].apply(lambda x: extract_parts_of_timing(x, 'time'))
# df['Days_open'] = df['timing'].apply(lambda x: extract_parts_of_timing(x, 'days'))

# # Drop the original 'timing' column if needed
# # df = df.drop('timing', axis=1)

# # Display the modified DataFrame
# #print(df[['name', 'time', 'Days_open']])

In [81]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata


In [82]:
import re

def extract_days_open(timing):
    """
    Extracts content inside simple brackets and returns it.

    Args:
        timing (string): Takes in the whole timing string.

    Returns:
        str: The extracted content inside simple brackets.

    Example:
        timing = "11am – 11:20pm (Mon-Sun)"
        
        days_open = extract_days_open(timing)
        -> "Mon-Sun"
    """
    # Try to match content inside simple brackets
    days_open_match = re.search(r'\((.*?)\)', timing)
    return days_open_match.group(1) if days_open_match else None




In [83]:
# Applying the function to create the 'Days_Open' column
df['Days_Open'] = df['timing'].apply(extract_days_open)

# Display the modified DataFrame
df.head(20)

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City,Days_Open
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata,
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
5,KFC,2135,4.1,"20K, Park Street, Park Street Area, Kolkata","Chinese, North Indian, Continental",450,11am – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
6,Tung Fong,3644,4.3,"25 B, Ground Floor, Karnani Mansion, Free Scho...","North Indian, Chinese, Kebab, BBQ",1000,"12noon – 4pm, 6pm – 11pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun
7,Arsalan,1500,4.0,"119 A, Muzaffar Ahmed Street, Mirza Ghalib Str...","North Indian, Chinese, Kebab, BBQ",1000,11am – 11:45pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
8,Kusum Rolls,1973,4.1,"21, Karnani Mansion, Park Street Area, Kolkata","Bakery, Desserts, Sandwich",300,12noon – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
9,Kareem's,1027,4.3,"55 B, Mirza Ghalib Street, Park Street Area, K...","Bakery, Desserts, Sandwich",1200,"12:30pm – 4:30pm, 6:30pm – 11:30pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun


In [84]:
filtered_rows = df[(df['Days_Open'].isnull()) & (df['timing'] != '-1')][['name', 'Days_Open', 'timing']]
print(filtered_rows)

len(df[(df['Days_Open'].isnull()) & (df['timing'] != '-1')][['name','Days_Open', 'timing']])

                                         name Days_Open  \
2                                  Carpe Diem      None   
169                         The Chicken House      None   
182                                    Zaranj      None   
298                    Olive - Hotel Niharika      None   
790                                  Tung Nam      None   
899              7/12 Fried Ice Cream Parlour      None   
916                                  Chin Wah      None   
1079  ITC Royal Bengal- Grand Market Pavilion      None   
1087                              Level Seven      None   
1570                        New Royal Biryani      None   
1578                            Keyar Henshel      None   
1743                           Rajendra Hotel      None   
1818                          Zeehan Cuisines      None   
2321                      Jawed Biryani House      None   
2457                            Tawa Tamancha      None   
2477                                Swadkahon      None 

36

As we can see there are 36 entries whose dates not added so we need to add them manually

### Date schedule of restaurants
* 790 - Tue closed
* 916 - Tue closed
* 1578 - null
* 1743 - null
* 1818 - null
* 2477 - null
* 2966 - Thu closed
* 3964, 4001, 4026 - null
* 5417 - Sun closed
* 6058 - Wed Closed
* 6528 - null


Rest open everyday

In [88]:
# Setting all the columns Mon-Sun in 36 columns
df.loc[df['Days_Open'].isnull(), 'Days_Open'] = 'Mon-Sun'
df[789:791]

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City,Days_Open
789,Anand Restaurant,1343,4.1,"19, C.R. Avenue, Near Dalhousie Square Area, C...","North Indian, Chinese, Bengali",500,"11am – 10pm (Mon, Tue, Thu, Fri, Sat), Closed...",Chandni Chowk,Kolkata,"Mon, Tue, Thu, Fri, Sat"
790,Tung Nam,1564,4.3,"24, Chattwala Gully, Near Terreti Bazar, Bara ...","Chinese, Fast Food",600,"12noon – 3pm, 6pm – 10pm (Mon, Wed, Thu, Fri...",Bara Bazar,Kolkata,Mon-Sun


Setting the not known fields to Mon-Sun 

In [91]:
df.at[2, 'Days_Open'] = 'Mon-Sun'
df.at[790, 'Days_Open'] = 'Mon, Wed, Thu, Fri, Sat, Sun'
df.at[916, 'Days_Open'] = 'Mon, Wed, Thu, Fri, Sat, Sun'
# df.at[1578, 'Days_Open'] = ''
# df.at[1743, 'Days_Open'] = ''
# df.at[1818, 'Days_Open'] = ''
# df.at[2477, 'Days_Open'] = ''
df.at[2966, 'Days_Open'] = 'Mon,Tue, Wed, Fri, Sat, Sun'
df.at[56, 'Days_Open'] = 'Tue, Wed,Thu, Fri, Sat, Sun'
# df.at[3964, 'Days_Open'] = ''
# df.at[4001, 'Days_Open'] = ''
# df.at[4026, 'Days_Open'] = ''
df.at[5417, 'Days_Open'] = 'Mon, Tue, Wed, Thu, Fri, Sat'
df.at[6058, 'Days_Open'] = 'Mon, Tue, Thu, Fri, Sat, Sun'
# df.at[6528, 'Days_Open'] = ''
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City,Days_Open
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata,Mon-Sun
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun


In the above database there are issues like if two components are encountered then second one is ignored e.g. (mon-Sat) and (sun) then sun is ignored

In [92]:
#df.to_csv('Zomato_processed_data.csv', index=False)

In [1]:
import pandas as pd

# Specify the path to your CSV file
file_path = 'Zomato_processed_data.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [2]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City,Days_Open
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata,Mon-Sun
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun


In [4]:
len(df['Days_Open'].value_counts())

32

In [97]:
#Exporting time column to .csv for better analysis
#df['timing'].to_csv('timing.txt', header=False, index=False)


In [103]:
import pandas as pd

# Assuming your DataFrame is named 'df'
filtered_df = df[(df['timing'].str.contains('(Mon-Sun)')==False) & (df['timing'] != '-1')]

print(filtered_df)
len(filtered_df)


                 name  voteCount  rating  \
2          Carpe Diem       4083     4.4   
15    The Sixth Sense         92     4.0   
32        Roll It Out         80     3.9   
54      Golden Dragon       1287     3.9   
65            Rollick        411     4.2   
...               ...        ...     ...   
7270     Home Kitchen  Cuisines:     0.0   
7285      Food Heaven          5     3.0   
7289      What a Roll         96     2.9   
7292    Shawarma City        117     2.9   
7294              KIK  Cuisines:     0.0   

                                                address  \
2                        18M, Park Street Area, Kolkata   
15    25B, Karnani Mansion, Opposite Westside, Park ...   
32             2, Middle Row, Park Street Area, Kolkata   
54     40A, 57, Park Mansion, Park Street Area, Kolkata   
65         1A, Russel Street, Park Street Area, Kolkata   
...                                                 ...   
7270  3rd Floor, Block A, Debojyoti Dham, Chandangar...   

  filtered_df = df[(df['timing'].str.contains('(Mon-Sun)')==False) & (df['timing'] != '-1')]


565

In [104]:
#filtered_df['timing'].to_csv('timing_filtered.txt', header=False, index=False)

In [3]:
import pandas as pd

# Specify the path to your CSV file
file_path = 'Zomato_processed_data.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,name,voteCount,rating,address,cusine,cost,timing,Location,City,Days_Open
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000,11am – 11:20pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200,11am – 12midnight (Mon-Sun),Park Street Area,Kolkata,Mon-Sun
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri...",Park Street Area,Kolkata,Mon-Sun
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)",Park Street Area,Kolkata,Mon-Sun
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400,7:30am – 11pm (Mon-Sun),Park Street Area,Kolkata,Mon-Sun


In [11]:
top_loc = df['Location'].value_counts()
top_loc.tail(20), len(top_loc)

(Location
 Salt Lake City                                       1
 Near Ballygunge                                      1
 Beliaghata Road                                      1
 Near Shibpur                                         1
 Near Howrah Maidan Area                              1
 Bandhaghat                                           1
 Ashoknagar                                           1
 Near Laxmi Narayan Mandir                            1
 375 Prince Anwar Shah Road                           1
 530A Jodhpur Park                                    1
 Near Kalikapur                                       1
 Mechua Bazar                                         1
  Kankurgachi                                         1
 Near Girish Park                                     1
 Deshapriya Park                                      1
 Beliaghata Main Road Beliaghata                      1
 Das Naagar                                           1
 WARD NO 068                          

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7388 entries, 0 to 7387
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       7388 non-null   object 
 1   voteCount  7388 non-null   object 
 2   rating     7388 non-null   float64
 3   address    7387 non-null   object 
 4   cusine     7388 non-null   object 
 5   cost       7388 non-null   object 
 6   timing     7388 non-null   object 
 7   Location   7188 non-null   object 
 8   City       7387 non-null   object 
 9   Days_Open  7388 non-null   object 
dtypes: float64(1), object(9)
memory usage: 577.3+ KB


In [13]:
len(df)

7388