# 1. We will use the API provided by 'https://geokeo.com/' to fetch the coordinates of different cities in India

# 2.  Because we are using the free token, we can only send 2500 requests (2500 locations) per day.

# 3. To work within this limit, I will create a dataframe that keeps every unique location together with its scraped coordinates. Locations that already have a latitude and longitude will be skipped on later runs

# 4. Let's see how many unique locations we have in our dataset

In [1]:
import requests
import os
import csv
import pandas as pd
import numpy as np

In [2]:
# Load the processed review data; its customers_city column holds the place names to geocode.
df = pd.read_csv(r"D:\flipkart reviews\all csv combine\processed_reviews.csv")
df.head()

Unnamed: 0,prod_id,product_name,brand_name,category,price,sold,prod_url,customer_name,purchase_date,customers_city,rating,comment_head,comment,purchase_month,purchase_year,short_name
0,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,Karimpur,5,Great product,Very good product 🙂🎈🎈🎈🎈🎈,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...
1,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,Ranchi,4,Really Nice,nice,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...
2,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Dhiraj Jaiswal,2023-01-01,Sidhi,3,Does the job,Good,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...
3,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Vasamsetti Durgayya,2023-01-01,Hyderabad,5,Best in the market!,Good product at this price.i am very happy.per...,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...
4,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,Bengaluru,1,Did not meet expectations,Worst laptop don't buy this laptop,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...


In [3]:
# Preview only the city column.
df[['customers_city']]

Unnamed: 0,customers_city
0,Karimpur
1,Ranchi
2,Sidhi
3,Hyderabad
4,Bengaluru
...,...
917572,Pileru
917573,Hosur
917574,Ernakulam
917575,Hoshiarpur


In [4]:
# Number of distinct city values, i.e. the most lookups that could be needed.
len(df['customers_city'].unique())

8986

### 4.1. Before we scrape the locations of our customers, we need to append ', India' to every location, because Flipkart only delivers within India. Otherwise there is a possibility of matching international locations that share the same name.

In [5]:
def add_india(loc):
    """Return *loc* as a whitespace-stripped string with ', India' appended."""
    city = str(loc).strip()
    return f"{city}, India"

In [6]:
# Rewrite every city in place as "<city>, India" using add_india.
df['customers_city'] = df['customers_city'].map(add_india)

In [7]:
# Confirm the suffix was applied.
df[['customers_city']]

Unnamed: 0,customers_city
0,"Karimpur, India"
1,"Ranchi, India"
2,"Sidhi, India"
3,"Hyderabad, India"
4,"Bengaluru, India"
...,...
917572,"Pileru, India"
917573,"Hosur, India"
917574,"Ernakulam, India"
917575,"Hoshiarpur, India"


In [8]:
# Re-count the distinct cities after adding the suffix.
len(df['customers_city'].unique())

8986

# 5. Making the code future-proof so I can send 2500 requests every day without re-requesting locations that have already been scraped

### 5.1. Creating a separate empty CSV, 'coordinates_data.csv', on disk if it does not already exist.

In [9]:
# Location of the CSV that stores already-geocoded cities between runs.
directory = r"D:\flipkart reviews\coordinates"
file_name = 'coordinates_data.csv'
file_path = os.path.join(directory, file_name)

In [10]:
# Make sure the output folder exists; exist_ok=True replaces the separate
# "does it exist?" check and is a no-op when the folder is already there.
os.makedirs(directory, exist_ok=True)
# On the first run, seed the file with just a header row so that later cells
# can always read it. An existing file is left untouched.
if not os.path.exists(file_path):
    df_loc = pd.DataFrame(columns=['customers_city', 'address', 'latitude', 'longitude'])
    df_loc.to_csv(file_path, index=False)

### 5.2. Creating a function that takes a customer location as its argument and fetches the 'latitude' & 'longitude' of that location.

In [11]:
def find_coordinates(customers_city):
    """Geocode one location through the geokeo.com search endpoint.

    Parameters
    ----------
    customers_city : str
        Free-text place name, e.g. ``'Pune, India'``.

    Returns
    -------
    str or None
        ``"<address>---<latitude>---<longitude>"`` built from the first result
        when the response reports ``status == 'ok'``. ``None`` when the status
        is missing or different, or when the result list is empty.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, or when no reply arrives within the timeout.
        These are deliberately not swallowed, so a transient network failure
        is not mistaken for "location not found".

    Whatever ``resp.json()`` raises for a body that is not valid JSON also
    propagates to the caller.
    """
    # NOTE(review): the API key is committed in source. Prefer setting the
    # GEOKEO_API_KEY environment variable; the literal is kept only as a
    # fallback so existing runs keep working.
    api_key = os.environ.get('GEOKEO_API_KEY', 'ca860c37d0eb65b9a0d288575d508871')

    # Passing the query via ``params`` URL-encodes spaces, commas and '&' in
    # place names instead of letting them corrupt the query string. The
    # timeout keeps one stalled request from blocking the whole run.
    resp = requests.get(
        url='https://geokeo.com/geocode/v1/search.php',
        params={'q': customers_city, 'api': api_key},
        timeout=30,
    )
    data = resp.json()

    if not isinstance(data, dict) or data.get('status') != 'ok':
        return None

    # An 'ok' status with no results would otherwise raise IndexError.
    results = data.get('results') or []
    if not results:
        return None

    first = results[0]
    location = first['geometry']['location']
    return f"{first['formatted_address']}---{location['lat']}---{location['lng']}"

### 5.3. Creating a function that scans 'coordinates_data.csv': if a given location is already in it, the function does not send a request to the API. This saves our free quota, which is only 2500 requests per day. Only if the location is not in 'coordinates_data.csv' does the function send a request to the API.

In [12]:
def find_lat_lng_if_not_exists(locations):
    """Geocode each location that is not yet stored in the coordinates CSV.

    The CSV at ``file_path`` is read once. Every location not already in its
    ``customers_city`` column is looked up with ``find_coordinates`` and the
    result is appended to the CSV immediately, so an interrupted run keeps
    everything fetched so far. Lookups that return nothing, or whose reply
    cannot be parsed, are stored with NaN address/latitude/longitude; being
    stored, they are not requested again on later runs.

    Parameters
    ----------
    locations : iterable of str
        Place names to look up. Duplicates are requested only once.

    Returns
    -------
    None
        Progress lines and a final summary are printed.
    """
    # Load the cache once instead of re-reading the CSV for every location.
    coordinates_data = pd.read_csv(file_path)
    columns = list(coordinates_data.columns)
    # A set gives constant-time membership tests and is updated as rows are added.
    known = set(coordinates_data['customers_city'])
    total_loc = len(coordinates_data)

    for location in locations:
        if location in known:
            continue

        address = latitude = longitude = np.nan
        try:
            coordinates = find_coordinates(location)
            if coordinates is not None:
                # Split from the right so an address that itself contains
                # '---' cannot shift the latitude/longitude fields.
                raw_address, raw_lat, raw_lng = coordinates.rsplit('---', 2)
                parsed = (str(raw_address), float(raw_lat), float(raw_lng))
                # Assign only after both numbers parsed, so a bad value
                # leaves all three fields as NaN.
                address, latitude, longitude = parsed
        except (AttributeError, ValueError, IndexError, KeyError):
            # Empty or malformed reply: keep the NaN placeholders.
            pass

        # Append just the new row rather than rewriting the whole file.
        new_row = pd.DataFrame([[location, address, latitude, longitude]], columns=columns)
        new_row.to_csv(file_path, mode='a', header=False, index=False)
        known.add(location)
        total_loc += 1
        print(f"request_no:{total_loc} | customers city:{location} |  latitude:{latitude} | longitude:{longitude}")

    print(f"All locations from processed_reviews.csv are already inside coordinates_data.csv | Total scraped locations: {total_loc}")

### 5.4. Let's test our code

In [111]:
# Read the coordinates file to see what has been stored so far

coordinates_data = pd.read_csv(file_path)
coordinates_data.head()

Unnamed: 0,customers_city,address,latitude,longitude


In [112]:
# Here I have intentionally inserted duplicated locations

test = ['Bengaluru, India','Hyderabad, India','Hyderabad, India','Pune, India','Pune, India']

In [115]:
# Pass the test list to the function; the repeated entries should be looked up only once

find_lat_lng_if_not_exists(test)

customers city: Bengaluru, India | address: Bengaluru,Karnataka,India | latitude: 12.97881363920382 | longitude: 77.60355665740477
customers city: Hyderabad, India | address: Hyderabad,Bahadurpura mandal,Telangana,500 002,India | latitude: 17.36058900017266 | longitude: 78.4740613
customers city: Pune, India | address: Pune City,Pune District,Maharashtra,411001,India | latitude: 18.52837142075631 | longitude: 73.8763633437941


In [116]:
# Read the coordinates file again to check what was stored
coordinates_data = pd.read_csv(file_path)
coordinates_data.head()

Unnamed: 0,customers_city,address,latitude,longitude
0,"Bengaluru, India","Bengaluru,Karnataka,India",12.978814,77.603557
1,"Hyderabad, India","Hyderabad,Bahadurpura mandal,Telangana,500 002...",17.360589,78.474061
2,"Pune, India","Pune City,Pune District,Maharashtra,411001,India",18.528371,73.876363


### Success! As you can see, our function only sends a request to the API when the location is not in 'coordinates_data.csv', and each new location is added to 'coordinates_data.csv'.

# 6. Applying our "find_lat_lng_if_not_exists()" function to the actual list of customer locations

In [13]:
# Create a list of the unique locations of our customers

unique_locations_list = list(df['customers_city'].unique())

In [14]:
# Number of unique locations to look up.
len(unique_locations_list)

8986

In [15]:
# Peek at a slice of the list

unique_locations_list[20:30]

['Belagavi, India',
 'Ghazipur District, India',
 'Sultanpur District, India',
 'Kamrup District, India',
 'Punalur, India',
 'Firozabad District, India',
 'Gulaothi, India',
 'Baduria, India',
 'Shimla District, India',
 'Bhiwandi, India']

In [16]:
# Pass unique_locations_list to our function to fetch coordinates for every location not yet stored.

find_lat_lng_if_not_exists(unique_locations_list)

request_no:8897 | customers city:Bangalore560003, India |  latitude:12.979119799677022 | longitude:77.5912997
request_no:8898 | customers city:Brehmapur, India |  latitude:19.309813400335003 | longitude:84.79715619999999
request_no:8899 | customers city:Akbarpur, Kanpur Dehat, India |  latitude:26.500000000112195 | longitude:80.0
request_no:8900 | customers city:Ghazaibad, India |  latitude:28.76876534351088 | longitude:77.47634117925475
request_no:8901 | customers city:Tumakur, India |  latitude:13.52162460255565 | longitude:76.94777823235009
request_no:8902 | customers city:Shivajinagar, India |  latitude:18.568605712019647 | longitude:73.7693997240562
request_no:8903 | customers city:Kuthankuzhi, India |  latitude:22.351114800402986 | longitude:78.6677428
request_no:8904 | customers city:Annanji, India |  latitude:10.03651979943981 | longitude:77.51121379999998
request_no:8905 | customers city:Rajiv Nagar, India |  latitude:17.23075694519034 | longitude:78.43203129614889
request_no:

request_no:8973 | customers city:Vadagaon, India |  latitude:15.833956400005295 | longitude:74.5201406
request_no:8974 | customers city:Ghanaur, India |  latitude:30.327593999711524 | longitude:76.60620799999998
request_no:8975 | customers city:Tribeni, India |  latitude:22.99030300038439 | longitude:88.3982606
request_no:8976 | customers city:District Dhule, India |  latitude:21.110920050181196 | longitude:74.60056375307022
request_no:8977 | customers city:Khalari, India |  latitude:20.6435832003961 | longitude:81.1534366
request_no:8978 | customers city:Mograhat, India |  latitude:22.325755700403484 | longitude:88.3500827
request_no:8979 | customers city:Buxr, India |  latitude:19.180714100326714 | longitude:72.9673476
request_no:8980 | customers city:Kriparampur, India |  latitude:22.351114800402986 | longitude:78.6677428
request_no:8981 | customers city:Jahazpur, India |  latitude:25.62149982076838 | longitude:75.27054895606231
request_no:8982 | customers city:Parawada, India |  la

# 7. Let's see the scraped addresses and coordinates

In [17]:
# Reload the stored coordinates from disk.
coordinates_data = pd.read_csv(file_path)
coordinates_data.head()

Unnamed: 0,customers_city,address,latitude,longitude
0,"Bengaluru, India","Bengaluru,Karnataka,India",12.978814,77.603557
1,"Hyderabad, India","Hyderabad,Bahadurpura mandal,Telangana,500 002...",17.360589,78.474061
2,"Pune, India","Pune City,Pune District,Maharashtra,411001,India",18.528371,73.876363
3,"Mumbai, India","Mumbai,Mumbai Suburban,Maharashtra,400070,India",19.07599,72.877393
4,"Karimpur, India","Karimpur,Karimpur-I,Nadia,West Bengal,741152,I...",23.981603,88.62961


In [18]:
# Total number of stored rows.
len(coordinates_data)

9006

In [19]:
# Missing values per column; failed lookups are stored with NaN address/latitude/longitude.
coordinates_data.isna().sum()

customers_city      0
address           344
latitude          344
longitude         344
dtype: int64

# 8. Adding a new feature, "state", for customers, derived from their addresses

In [20]:
# Names of Indian states followed by union territories. find_state looks for
# these exact spellings as substrings of an address, so a differently spelled
# name in an address (e.g. "and" instead of "&") will not match.
states_of_india = [
    "Andhra Pradesh",
    "Arunachal Pradesh",
    "Assam",
    "Bihar",
    "Chhattisgarh",
    "Goa",
    "Gujarat",
    "Haryana",
    "Himachal Pradesh",
    "Jharkhand",
    "Karnataka",
    "Kerala",
    "Madhya Pradesh",
    "Maharashtra",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Punjab",
    "Rajasthan",
    "Sikkim",
    "Tamil Nadu",
    "Telangana",
    "Tripura",
    "Uttarakhand",
    "Uttar Pradesh",
    "West Bengal",
    "Andaman and Nicobar Islands",
    "Chandigarh",
    "Dadra and Nagar Haveli and Daman & Diu",
    "Delhi",
    "Jammu & Kashmir",
    "Ladakh",
    "Lakshadweep",
    "Puducherry"
]

In [21]:
def find_state(address):
    """Return the state or union-territory name contained in *address*, else NaN.

    The address is converted to ``str`` and checked against each entry of
    ``states_of_india`` by plain substring match. When several names occur in
    the address, the one listed last in ``states_of_india`` is returned.
    """
    text = str(address)
    # Walking the list backwards and returning on the first hit gives the same
    # answer as scanning forwards and keeping the last hit.
    for candidate in reversed(states_of_india):
        if candidate in text:
            return candidate
    return np.nan

In [22]:
# Derive a 'state' column from each stored address via find_state.
coordinates_data['state'] = coordinates_data['address'].map(find_state)

In [23]:
# Preview the first rows with the new column.
coordinates_data.head(10)

Unnamed: 0,customers_city,address,latitude,longitude,state
0,"Bengaluru, India","Bengaluru,Karnataka,India",12.978814,77.603557,Karnataka
1,"Hyderabad, India","Hyderabad,Bahadurpura mandal,Telangana,500 002...",17.360589,78.474061,Telangana
2,"Pune, India","Pune City,Pune District,Maharashtra,411001,India",18.528371,73.876363,Maharashtra
3,"Mumbai, India","Mumbai,Mumbai Suburban,Maharashtra,400070,India",19.07599,72.877393,Maharashtra
4,"Karimpur, India","Karimpur,Karimpur-I,Nadia,West Bengal,741152,I...",23.981603,88.62961,West Bengal
5,"Ranchi, India","Jharkhand,Kanke,Ranchi,India",23.455981,85.25573,Jharkhand
6,"Sidhi, India","Sidhi,Gopadbanas Tahsil,Madhya Pradesh,486600,...",24.410892,81.879469,Madhya Pradesh
7,"Kosi Kalan, India","Kosi Kalan,Chhata,Mathura,Uttar Pradesh,281403...",27.792541,77.43679,Uttar Pradesh
8,"Ballia, India","Ballia,Uttar Pradesh,277304,India",25.876567,84.102347,Uttar Pradesh
9,"Dimapur, India","Dimapur,Dimapur Sadar,Nagaland,797112,India",25.913646,93.728346,Nagaland


In [24]:
# Number of rows for which no state name was found in the address.
coordinates_data['state'].isnull().sum()

735

# 9. Merging the main "df" with "coordinates_data" on the key 'customers_city'

In [25]:
# coordinates_data is built up incrementally and may list a customers_city
# more than once; a left merge would then repeat the matching review rows.
# Keep one coordinate row per city so the number of reviews is unchanged.
df = df.merge(
    coordinates_data.drop_duplicates(subset='customers_city'),
    on='customers_city',
    how='left',
)

In [26]:
# Preview the merged frame.
df.head()

Unnamed: 0,prod_id,product_name,brand_name,category,price,sold,prod_url,customer_name,purchase_date,customers_city,rating,comment_head,comment,purchase_month,purchase_year,short_name,address,latitude,longitude,state
0,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,"Karimpur, India",5,Great product,Very good product 🙂🎈🎈🎈🎈🎈,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...,"Karimpur,Karimpur-I,Nadia,West Bengal,741152,I...",23.981603,88.62961,West Bengal
1,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,"Ranchi, India",4,Really Nice,nice,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...,"Jharkhand,Kanke,Ranchi,India",23.455981,85.25573,Jharkhand
2,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Dhiraj Jaiswal,2023-01-01,"Sidhi, India",3,Does the job,Good,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...,"Sidhi,Gopadbanas Tahsil,Madhya Pradesh,486600,...",24.410892,81.879469,Madhya Pradesh
3,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Vasamsetti Durgayya,2023-01-01,"Hyderabad, India",5,Best in the market!,Good product at this price.i am very happy.per...,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...,"Hyderabad,Bahadurpura mandal,Telangana,500 002...",17.360589,78.474061,Telangana
4,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999.0,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,2023-01-01,"Bengaluru, India",1,Did not meet expectations,Worst laptop don't buy this laptop,1,2023,acer Aspire 3 Core i5 11th Gen - (8 GB/512...,"Bengaluru,Karnataka,India",12.978814,77.603557,Karnataka


# 10. Saving the new data to CSV

In [27]:
# Write the merged frame to disk without the index column.
df.to_csv(r"D:\flipkart reviews\all csv combine\added_coordinates_reviews.csv", index=False)

# 11. Let's see the sales distribution on a map of India

In [None]:
# Reload the merged data saved above. read_csv lives in the pandas namespace;
# the bare name raised a NameError.
df = pd.read_csv(r"D:\flipkart reviews\all csv combine\added_coordinates_reviews.csv")

In [28]:
# Count review rows per city and call that count 'sales'. Naming the column
# through reset_index(name=...) works whether pandas labels the value_counts
# result 0 (older versions) or 'count' (pandas 2.x), where renaming column 0
# silently did nothing.
df2 = df[['customers_city']].value_counts().reset_index(name='sales')
df2.head()

Unnamed: 0,customers_city,sales
0,"New Delhi, India",32257
1,"Bengaluru, India",31874
2,"Hyderabad, India",27456
3,"Chennai, India",20927
4,"Mumbai, India",13839


In [29]:
# Attach coordinates to the per-city counts. Only latitude/longitude are
# required for the map, so rows are dropped only when those are missing; a
# bare dropna() also discarded cities whose 'state' could not be derived.
# Repeated city rows in coordinates_data are removed first so that a city's
# count is not repeated.
sales = (
    df2.merge(
        coordinates_data.drop_duplicates(subset='customers_city'),
        on='customers_city',
        how='left',
    )
    .dropna(subset=['latitude', 'longitude'])
)

In [30]:
# Preview the per-city counts with their coordinates.
sales.head()

Unnamed: 0,customers_city,sales,address,latitude,longitude,state
0,"New Delhi, India",32257,"New Delhi,Chanakya Puri Tehsil,110004,India",28.613895,77.209006,Delhi
1,"Bengaluru, India",31874,"Bengaluru,Karnataka,India",12.978814,77.603557,Karnataka
2,"Hyderabad, India",27456,"Hyderabad,Bahadurpura mandal,Telangana,500 002...",17.360589,78.474061,Telangana
3,"Chennai, India",20927,"Chennai,CMWSSB Division 58,Chennai District,Ta...",13.083694,80.270186,Tamil Nadu
4,"Mumbai, India",13839,"Mumbai,Mumbai Suburban,Maharashtra,400070,India",19.07599,72.877393,Maharashtra


In [31]:
import folium
from folium.plugins import HeatMap

In [32]:
def generatebasemap(default_location=(22.9734, 78.6569), default_zoom_start=5):
    """Create an empty folium map to draw layers on.

    Parameters
    ----------
    default_location : sequence of two floats, optional
        ``(latitude, longitude)`` the map is centred on. The default is a
        tuple rather than a list so the shared default object cannot be
        mutated between calls. Lists passed by callers are still accepted.
    default_zoom_start : int, optional
        Initial zoom level handed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The newly created map.
    """
    return folium.Map(location=default_location, zoom_start=default_zoom_start)

In [33]:
# Build the base map and add a heat layer; each row handed to HeatMap is
# [latitude, longitude, sales].
# NOTE(review): zoom and max_opacity do not appear to be named HeatMap
# parameters — confirm against the installed folium version that they have an effect.
basemap = generatebasemap()
HeatMap(sales[['latitude','longitude','sales']].values.tolist(),
        zoom=20,
        radius=15,
        min_opacity=0.4, 
        max_opacity=1).add_to(basemap)

# Last expression in the cell, so the notebook displays the map.
basemap