In [None]:
import requests
import pandas as pd
import time

def fetch_ip_data(df):
    """Geolocate the addresses in ``df["ipAddress"]`` through the ip-api.com batch endpoint.

    Rows are sent in positional chunks of 100 addresses per POST request. An HTTP 429
    answer means "wait and ask again": the function sleeps for the number of seconds
    announced in the ``X-Ttl`` response header plus one, then repeats the request
    without using up one of the three attempts reserved for real failures (non-200
    status codes, connection problems, other request errors).

    Args:
        df: DataFrame with an ``ipAddress`` column holding the addresses to look up.

    Returns:
        A DataFrame built from the JSON records of every batch that succeeded. A batch
        that still fails after all attempts is reported on stdout and left out, so the
        result can have fewer rows than ``df``.
    """
    url = "http://ip-api.com/batch?fields=status,message,countryCode,lat,lon,query"
    batch_size = 100  # Max IPs per request
    max_attempts = 3  # Tries per batch for genuine failures
    max_rate_limit_waits = 5  # Cap on 429 waits per batch so the loop always terminates
    total_ips = len(df)
    processed_ips = 0  # Progress counter
    results = []

    for i in range(0, total_ips, batch_size):
        # .iloc keeps the slicing positional regardless of the frame's index labels.
        batch = df.iloc[i:i + batch_size]["ipAddress"].tolist()
        attempts = 0
        rate_limit_waits = 0
        succeeded = False

        while attempts < max_attempts and rate_limit_waits < max_rate_limit_waits:
            try:
                response = requests.post(url, json=batch, timeout=10)

                if response.status_code == 429:
                    try:
                        ttl = int(response.headers.get("X-Ttl", 60))
                    except (TypeError, ValueError):
                        ttl = 60  # Header missing or not numeric: fall back to a full minute
                    # The service can report X-Ttl = 0 while still answering 429, so
                    # always wait at least one second instead of retrying immediately.
                    retry_after = max(ttl, 0) + 1
                    print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                    time.sleep(retry_after)
                    rate_limit_waits += 1  # Deliberately not counted as a failed attempt
                    continue

                if response.status_code == 200:
                    data = response.json()
                    results.extend(data)
                    processed_ips += len(batch)
                    print(f"Processed {processed_ips}/{total_ips} IPs")
                    succeeded = True
                    break

                print(f"Error: {response.status_code}")
                attempts += 1
                time.sleep(2)  # Wait before retrying

            except requests.exceptions.ConnectionError:
                print("Connection error. Retrying in 5 seconds...")
                attempts += 1
                time.sleep(5)
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                attempts += 1
                time.sleep(2)

        if not succeeded:
            # Make the gap visible instead of dropping the batch silently.
            print(f"Giving up on {len(batch)} IPs starting at row {i}; they are missing from the result.")

    return pd.DataFrame(results)

# Load the source table; later cells read its "ipAddress" column.
ips = pd.read_csv("/content/ips.csv")


In [None]:
# Reduce the raw column to its distinct values and report how many there are.
unique_ips = ips["ipAddress"].unique()
print(unique_ips.shape)

(44967,)


In [None]:
# Wrap the array of distinct IPs in a one-column DataFrame; fetch_ip_data reads the "ipAddress" column.
unique_ips = pd.DataFrame(unique_ips,columns=["ipAddress"])

In [None]:
# Sanity check: list any repeated rows in the de-duplicated frame.
unique_ips[unique_ips.duplicated()]

Unnamed: 0,ipAddress


In [None]:
# Smoke test: look up only the first 100 IPs (one request's worth) and display the result.
result_df1 = fetch_ip_data(unique_ips[:100])
result_df1

Processed 100/100 IPs


Unnamed: 0,status,countryCode,lat,lon,query
0,success,PK,33.7215,73.0433,203.215.165.236
1,success,PK,31.2024,73.9486,223.123.0.46
2,success,PK,31.5580,74.3587,103.134.3.130
3,success,PK,24.8607,67.0011,119.155.188.8
4,success,PK,28.5466,68.2231,103.120.117.148
...,...,...,...,...,...
95,success,PK,24.9239,67.1423,223.123.115.231
96,success,PK,32.8247,73.8782,39.34.234.79
97,success,PK,30.0726,71.1938,223.123.3.152
98,success,PK,27.7070,68.8525,39.63.85.86


In [None]:
# Check the first result set for fully duplicated rows.
result_df1[result_df1.duplicated()]

Unnamed: 0,status,country,lat,lon,query,message


In [None]:
# Keep the rows of the IP list from position 33667 onward and check them for duplicates.
# NOTE(review): the 33667 offset is hard-coded; presumably it marks where an earlier run stopped — confirm.
dfNew = unique_ips[33667:]
dfNew[dfNew.duplicated()]

Unnamed: 0,ipAddress


In [None]:
# Preview the slice that is looked up next.
dfNew

Unnamed: 0,ipAddress
33667,49.37.156.251
33668,152.58.147.222
33669,106.221.185.171
33670,152.59.143.84
33671,106.206.236.228
...,...
44962,49.126.245.5
44963,120.89.104.21
44964,27.34.64.217
44965,150.107.106.55


In [None]:
# Look up every IP in dfNew and display what came back.
# fetch_ip_data can give up on a batch, so the result may have fewer rows than the input.
result_df2 = fetch_ip_data(dfNew)
result_df2

Processed 100/11300 IPs
Processed 200/11300 IPs
Processed 300/11300 IPs
Processed 400/11300 IPs
Processed 500/11300 IPs
Processed 600/11300 IPs
Processed 700/11300 IPs
Processed 800/11300 IPs
Processed 900/11300 IPs
Processed 1000/11300 IPs
Processed 1100/11300 IPs
Processed 1200/11300 IPs
Processed 1300/11300 IPs
Processed 1400/11300 IPs
Processed 1500/11300 IPs
Rate limit exceeded. Retrying after 59 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds

Unnamed: 0,status,country,lat,lon,query
0,success,India,17.3843,78.4583,49.37.156.251
1,success,India,24.8081,93.9442,152.58.147.222
2,success,India,17.3843,78.4583,106.221.185.171
3,success,India,25.5943,85.1352,152.59.143.84
4,success,India,30.7339,76.7889,106.206.236.228
...,...,...,...,...,...
8695,success,Nepal,26.4550,87.2701,49.126.245.5
8696,success,Nepal,27.7108,85.3251,120.89.104.21
8697,success,Nepal,27.7108,85.3251,27.34.64.217
8698,success,Nepal,26.7202,86.4826,150.107.106.55


In [None]:
# Check this result set for fully duplicated rows.
result_df2[result_df2.duplicated()]

Unnamed: 0,status,country,lat,lon,query


In [None]:
# Keep the rows of dfNew from position 8700 onward and check them for duplicates.
# NOTE(review): 8700 is hard-coded; presumably the number of rows the previous lookup returned — confirm.
dfNew1 = dfNew[8700:]
dfNew1[dfNew1.duplicated()]

Unnamed: 0,ipAddress


In [None]:
# Preview the slice that is looked up next.
dfNew1

Unnamed: 0,ipAddress
42367,103.178.189.11
42368,223.239.88.177
42369,43.246.202.150
42370,103.181.74.11
42371,103.140.212.79
...,...
44962,49.126.245.5
44963,120.89.104.21
44964,27.34.64.217
44965,150.107.106.55


In [None]:
# Look up every IP in dfNew1 and display what came back.
# fetch_ip_data can give up on a batch, so the result may have fewer rows than the input.
result_df3 = fetch_ip_data(dfNew1)
result_df3

Processed 100/2600 IPs
Processed 200/2600 IPs
Processed 300/2600 IPs
Processed 400/2600 IPs
Processed 500/2600 IPs
Processed 600/2600 IPs
Processed 700/2600 IPs
Processed 800/2600 IPs
Processed 900/2600 IPs
Processed 1000/2600 IPs
Processed 1100/2600 IPs
Processed 1200/2600 IPs
Processed 1300/2600 IPs
Processed 1400/2600 IPs
Processed 1500/2600 IPs
Rate limit exceeded. Retrying after 59 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit 

Unnamed: 0,status,country,lat,lon,query,message
0,success,Bangladesh,23.5931,90.1425,103.178.189.11,
1,success,India,23.8354,91.2818,223.239.88.177,
2,success,Bangladesh,23.6500,90.6167,43.246.202.150,
3,success,Bangladesh,24.8060,89.3138,103.181.74.11,
4,success,Bangladesh,22.3384,91.8317,103.140.212.79,
...,...,...,...,...,...,...
1995,success,Nepal,26.4550,87.2701,49.126.245.5,
1996,success,Nepal,27.7108,85.3251,120.89.104.21,
1997,success,Nepal,27.7108,85.3251,27.34.64.217,
1998,success,Nepal,26.7202,86.4826,150.107.106.55,


In [None]:
# Check this result set for fully duplicated rows.
result_df3[result_df3.duplicated()]

Unnamed: 0,status,country,lat,lon,query,message


In [None]:
# Keep the rows of dfNew1 from position 2000 onward and check them for duplicates.
# NOTE(review): 2000 is hard-coded; presumably the number of rows the previous lookup returned — confirm.
dfNew2 = dfNew1[2000:]
dfNew2[dfNew2.duplicated()]

Unnamed: 0,ipAddress


In [None]:
# Preview the slice that is looked up next.
dfNew2

Unnamed: 0,ipAddress
44367,36.252.248.197
44368,27.34.64.251
44369,27.34.72.117
44370,27.34.79.37
44371,157.15.58.82
...,...
44962,49.126.245.5
44963,120.89.104.21
44964,27.34.64.217
44965,150.107.106.55


In [None]:
# Look up every IP in dfNew2 and display what came back.
result_df4 = fetch_ip_data(dfNew2)
result_df4

Processed 100/600 IPs
Processed 200/600 IPs
Processed 300/600 IPs
Processed 400/600 IPs
Processed 500/600 IPs
Processed 600/600 IPs


Unnamed: 0,status,country,lat,lon,query
0,success,Nepal,28.2669,83.9685,36.252.248.197
1,success,Nepal,27.7108,85.3251,27.34.64.251
2,success,Nepal,27.7108,85.3251,27.34.72.117
3,success,Nepal,27.7108,85.3251,27.34.79.37
4,success,Nepal,27.5776,83.6989,157.15.58.82
...,...,...,...,...,...
595,success,Nepal,26.4550,87.2701,49.126.245.5
596,success,Nepal,27.7108,85.3251,120.89.104.21
597,success,Nepal,27.7108,85.3251,27.34.64.217
598,success,Nepal,26.7202,86.4826,150.107.106.55


In [None]:
# Stack the four partial result sets and keep a single row per queried IP
# (drop_duplicates keeps the first occurrence), then report and display the total.
uniqueIPs = pd.concat([result_df1, result_df2, result_df3, result_df4]).drop_duplicates(subset=["query"])
print("Final unique IP count:", uniqueIPs.shape[0])
uniqueIPs

Final unique IP count: 36667


Unnamed: 0,status,country,lat,lon,query,message
0,success,Pakistan,33.7215,73.0433,203.215.165.236,
1,success,Pakistan,31.2024,73.9486,223.123.0.46,
2,success,Pakistan,31.5580,74.3587,103.134.3.130,
3,success,Pakistan,24.8607,67.0011,119.155.188.8,
4,success,Pakistan,28.5466,68.2231,103.120.117.148,
...,...,...,...,...,...,...
8028,success,Nepal,27.7108,85.3251,27.34.65.69,
8029,success,Nepal,27.7108,85.3251,27.34.68.142,
8030,success,Nepal,26.8566,85.5594,202.51.92.90,
8031,success,Nepal,27.7108,85.3251,113.199.228.79,


In [None]:
# Build sets of the requested IPs and of the IPs that already have a lookup row,
# so the comparison below is a plain set difference.
original_ips_set = set(unique_ips['ipAddress'])
processed_ips_set = set(uniqueIPs['query'])

# IPs that were requested but have no row in uniqueIPs yet.
missing_ips = original_ips_set - processed_ips_set

In [None]:
# Turn the set of missing IPs into a one-column DataFrame.
# Sorting gives a stable row order: iteration order of a set of strings changes between
# interpreter runs, which would make the positional slices taken from this frame later
# (e.g. missing_ips_df[6200:]) refer to different IPs on every restart.
missing_ips_df = pd.DataFrame(sorted(missing_ips), columns=['ipAddress'])
missing_ips_df


Unnamed: 0,ipAddress
0,111.88.36.107
1,119.155.215.68
2,37.111.159.250
3,119.155.203.0
4,110.224.115.126
...,...
8295,103.111.39.65
8296,39.44.145.237
8297,182.190.206.136
8298,39.59.96.103


In [None]:
# Look up the IPs that were still missing and display what came back.
# fetch_ip_data can give up on a batch, so the result may have fewer rows than the input.
result_df5 = fetch_ip_data(missing_ips_df)
result_df5

Processed 100/8300 IPs
Processed 200/8300 IPs
Processed 300/8300 IPs
Processed 400/8300 IPs
Processed 500/8300 IPs
Processed 600/8300 IPs
Processed 700/8300 IPs
Processed 800/8300 IPs
Processed 900/8300 IPs
Processed 1000/8300 IPs
Processed 1100/8300 IPs
Processed 1200/8300 IPs
Processed 1300/8300 IPs
Processed 1400/8300 IPs
Processed 1500/8300 IPs
Rate limit exceeded. Retrying after 59 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit 

Unnamed: 0,status,country,lat,lon,query
0,success,Pakistan,24.8794,66.9920,111.88.36.107
1,success,Pakistan,24.8607,67.0011,119.155.215.68
2,success,Pakistan,27.9368,69.3194,37.111.159.250
3,success,Pakistan,24.8608,67.0104,119.155.203.0
4,success,India,19.0748,72.8856,110.224.115.126
...,...,...,...,...,...
6195,success,Pakistan,33.5973,73.0479,103.111.39.65
6196,success,Pakistan,34.0083,71.5812,39.44.145.237
6197,success,Pakistan,24.8591,66.9983,182.190.206.136
6198,success,Pakistan,31.3709,73.0336,39.59.96.103


In [None]:
# Look up the rows of missing_ips_df from position 6200 onward.
# NOTE(review): 6200 is hard-coded; presumably the number of rows the previous lookup returned — confirm.
result_df6 = fetch_ip_data(missing_ips_df[6200:])
result_df6

Processed 100/2100 IPs
Processed 200/2100 IPs
Processed 300/2100 IPs
Processed 400/2100 IPs
Processed 500/2100 IPs
Processed 600/2100 IPs
Processed 700/2100 IPs
Processed 800/2100 IPs
Processed 900/2100 IPs
Processed 1000/2100 IPs
Processed 1100/2100 IPs
Processed 1200/2100 IPs
Processed 1300/2100 IPs
Processed 1400/2100 IPs
Processed 1500/2100 IPs
Rate limit exceeded. Retrying after 59 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit 

Unnamed: 0,status,country,lat,lon,query
0,success,Pakistan,31.5826,74.3276,223.123.20.61
1,success,Pakistan,24.8591,66.9983,39.50.183.148
2,success,Pakistan,24.8608,67.0104,37.111.159.196
3,success,India,21.1981,72.8298,152.59.35.184
4,success,Pakistan,32.2722,72.9390,124.29.254.83
...,...,...,...,...,...
1595,success,Pakistan,33.5973,73.0479,103.111.39.65
1596,success,Pakistan,34.0083,71.5812,39.44.145.237
1597,success,Pakistan,24.8591,66.9983,182.190.206.136
1598,success,Pakistan,31.3709,73.0336,39.59.96.103


In [None]:
# Re-create the slice used for result_df6, skip its first 1600 rows and look up the rest.
# NOTE(review): 6200 and 1600 are hard-coded; presumably the row counts of the two previous lookups — confirm.
d = missing_ips_df[6200:]
result_df7 = fetch_ip_data(d[1600:])
result_df7

Processed 100/500 IPs
Processed 200/500 IPs
Processed 300/500 IPs
Processed 400/500 IPs
Processed 500/500 IPs


Unnamed: 0,status,country,lat,lon,query
0,success,India,28.8395,78.7699,157.39.15.211
1,success,Pakistan,31.7209,72.9784,37.111.145.116
2,success,India,20.2706,85.8334,152.59.155.90
3,success,India,26.8373,80.9165,152.59.191.27
4,success,Pakistan,34.4377,73.2237,223.123.86.20
...,...,...,...,...,...
495,success,Pakistan,33.5973,73.0479,103.111.39.65
496,success,Pakistan,34.0083,71.5812,39.44.145.237
497,success,Pakistan,24.8591,66.9983,182.190.206.136
498,success,Pakistan,31.3709,73.0336,39.59.96.103


In [None]:
# Fold the three retry result sets into the running table, keeping one row per queried IP.
# The cell rebinds uniqueIPs, so the count printed here supersedes the earlier one.
uniqueIPs = pd.concat([uniqueIPs, result_df5, result_df6,result_df7]).drop_duplicates(subset=["query"])
print("Final unique IP count:", uniqueIPs.shape[0])

Final unique IP count: 43267


In [None]:
# Build sets of the requested IPs and of the IPs that already have a lookup row,
# so the comparison below is a plain set difference.
original_ips_set = set(unique_ips['ipAddress'])
processed_ips_set = set(uniqueIPs['query'])

# IPs that were requested but have no row in uniqueIPs yet.
missing_ips = original_ips_set - processed_ips_set

In [None]:
# Number of requested IPs that still have no lookup row.
len(missing_ips)

1700

In [None]:
# Turn the set of missing IPs into a one-column DataFrame.
# Sorting gives a stable row order: iteration order of a set of strings changes between
# interpreter runs, which would make the positional slice taken from this frame later
# (missing_ips_df[1500:]) refer to different IPs on every restart.
missing_ips_df = pd.DataFrame(sorted(missing_ips), columns=['ipAddress'])
missing_ips_df

Unnamed: 0,ipAddress
0,112.79.119.181
1,49.36.64.146
2,119.155.217.169
3,59.103.27.126
4,117.254.232.60
...,...
1695,223.123.106.7
1696,182.181.39.37
1697,119.155.222.82
1698,154.80.27.215


In [None]:
# Look up the IPs that were still missing and display what came back.
# fetch_ip_data can give up on a batch, so the result may have fewer rows than the input.
result_df8 = fetch_ip_data(missing_ips_df)
result_df8

Processed 100/1700 IPs
Processed 200/1700 IPs
Processed 300/1700 IPs
Processed 400/1700 IPs
Processed 500/1700 IPs
Processed 600/1700 IPs
Processed 700/1700 IPs
Processed 800/1700 IPs
Processed 900/1700 IPs
Processed 1000/1700 IPs
Processed 1100/1700 IPs
Processed 1200/1700 IPs
Processed 1300/1700 IPs
Processed 1400/1700 IPs
Processed 1500/1700 IPs
Rate limit exceeded. Retrying after 59 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...
Rate limit exceeded. Retrying after 0 seconds...


Unnamed: 0,status,country,lat,lon,query
0,success,India,26.7084,88.4318,112.79.119.181
1,success,India,23.0276,72.5871,49.36.64.146
2,success,Pakistan,24.8607,67.0011,119.155.217.169
3,success,Pakistan,34.1159,72.4698,59.103.27.126
4,success,India,23.2505,77.4065,117.254.232.60
...,...,...,...,...,...
1495,success,Pakistan,24.8591,66.9983,111.88.103.172
1496,success,Pakistan,32.2722,72.9390,103.120.71.99
1497,success,Pakistan,31.3709,73.0336,154.81.240.234
1498,success,Pakistan,32.4747,74.4498,144.48.135.60


In [None]:
# Look up the rows of missing_ips_df from position 1500 onward.
# NOTE(review): 1500 is hard-coded; presumably the number of rows the previous lookup returned — confirm.
result_df9 = fetch_ip_data(missing_ips_df[1500:])
result_df9

Processed 100/200 IPs
Processed 200/200 IPs


Unnamed: 0,status,country,lat,lon,query
0,success,Pakistan,24.8591,66.9983,154.198.97.255
1,success,Pakistan,31.5826,74.3276,154.80.47.54
2,success,Pakistan,24.8608,67.0104,119.155.165.95
3,success,Pakistan,24.8741,67.1906,103.12.122.76
4,success,India,27.1823,78.0252,106.205.168.240
...,...,...,...,...,...
195,success,Pakistan,24.8841,67.1504,223.123.106.7
196,success,Pakistan,32.4858,74.5370,182.181.39.37
197,success,Pakistan,24.8607,67.0011,119.155.222.82
198,success,Pakistan,31.5826,74.3276,154.80.27.215


In [None]:
# Fold the two newest retry result sets into the running table, keeping one row per queried IP.
# result_df6 is not listed again: the earlier merge cell already concatenated it into uniqueIPs,
# so repeating it only added rows that drop_duplicates removed straight away.
uniqueIPs = pd.concat([uniqueIPs, result_df8, result_df9]).drop_duplicates(subset=["query"])
print("Final unique IP count:", uniqueIPs.shape[0])

Final unique IP count: 44967


In [None]:
# Build sets of the requested IPs and of the IPs that already have a lookup row,
# so the comparison below is a plain set difference.
original_ips_set = set(unique_ips['ipAddress'])
processed_ips_set = set(uniqueIPs['query'])

# IPs that were requested but have no row in uniqueIPs yet.
missing_ips = original_ips_set - processed_ips_set

In [None]:
# Display the IPs that still have no lookup row.
missing_ips

set()

In [None]:
# Persist the combined lookup table as CSV, leaving out the DataFrame index.
uniqueIPs.to_csv("/content/uniqueIPs.csv", index=False)

In [None]:
!pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [None]:
import requests
import pandas as pd
import time

def fetch_ip_data(df, max_rounds=5):
    """Geolocate the addresses in ``df["ip"]`` through the ip-api.com batch endpoint.

    This redefinition shadows the earlier ``fetch_ip_data`` in this notebook, which
    reads the ``ipAddress`` column instead. It differs in three ways: it also requests
    the ``country`` field, it tags every returned record with
    ``location_type = "IP Address"``, and it re-submits addresses that have no
    record yet in further rounds.

    Addresses are posted in positional chunks of 100. An HTTP 429 answer makes the
    function sleep for the ``X-Ttl`` header value plus one second and repeat the
    request without using up one of the three attempts reserved for real failures
    (non-200 status codes, connection problems, other request errors).

    Args:
        df: DataFrame with an ``ip`` column holding the addresses to look up.
        max_rounds: Upper bound on full passes over the still-unanswered addresses.
            Without a bound the function would never return if some address can
            never be matched to a ``query`` value in the responses.

    Returns:
        A DataFrame with one row per record returned by the service. It is empty
        when no batch succeeded. Addresses left unanswered after ``max_rounds``
        passes are reported on stdout and missing from the result.
    """
    url = "http://ip-api.com/batch?fields=status,message,country,countryCode,lat,lon,query"
    batch_size = 100  # Max IPs per request
    max_attempts = 3  # Tries per batch for genuine failures
    max_rate_limit_waits = 5  # Cap on 429 waits per batch so the loop always terminates
    total_ips = len(df)

    def _lookup(batch):
        """Return the parsed records for one batch, or None when every attempt failed."""
        attempts = 0
        rate_limit_waits = 0
        while attempts < max_attempts and rate_limit_waits < max_rate_limit_waits:
            try:
                response = requests.post(url, json=batch, timeout=10)

                if response.status_code == 429:
                    try:
                        ttl = int(response.headers.get("X-Ttl", 60))
                    except (TypeError, ValueError):
                        ttl = 60  # Header missing or not numeric: fall back to a full minute
                    # X-Ttl may be 0 while the service still answers 429, so always
                    # wait at least one second instead of retrying immediately.
                    retry_after = max(ttl, 0) + 1
                    print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                    time.sleep(retry_after)
                    rate_limit_waits += 1  # Deliberately not counted as a failed attempt
                    continue

                if response.status_code == 200:
                    return response.json()

                print(f"Error: {response.status_code}")
                attempts += 1
                time.sleep(2)  # Wait before retrying

            except requests.exceptions.ConnectionError:
                print("Connection error. Retrying in 5 seconds...")
                attempts += 1
                time.sleep(5)
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                attempts += 1
                time.sleep(2)
        return None

    # Collect plain records and build the DataFrame once at the end instead of
    # concatenating frames inside the loop.
    all_records = []
    pending = df
    rounds = 0

    while not pending.empty and rounds < max_rounds:
        rounds += 1
        already_done = len(all_records)
        round_records = []

        for i in range(0, len(pending), batch_size):
            batch = pending.iloc[i:i + batch_size]["ip"].tolist()
            data = _lookup(batch)
            if data is None:
                continue  # Left in `pending`; picked up again in the next round

            # Add "location_type" column as "IP Address"
            for entry in data:
                entry["location_type"] = "IP Address"

            round_records.extend(data)
            print(f"Processed {already_done + len(round_records)}/{total_ips} IPs")  # Debug info

        all_records.extend(round_records)

        # Keep only the addresses that still have no record. Working from the record
        # list avoids indexing a "query" column that does not exist when nothing succeeded.
        answered = {entry.get("query") for entry in all_records}
        pending = pending[~pending["ip"].isin(answered)]

    if not pending.empty:
        print(f"Giving up on {len(pending)} IPs after {rounds} rounds; they are missing from the result.")

    return pd.DataFrame(all_records)

# Example usage
# NOTE: this runs in the same cell as the definition above, so every execution of the cell
# performs a live lookup against ip-api.com.
# The last entry is not a well-formed address; presumably it is there to show how a
# failed lookup is reported — confirm against the service's response.
df = pd.DataFrame({"ip": ["8.8.8.8", "24.48.0.1", "1.1.1.1", "invalid.ip"]})  # Sample data
result_df = fetch_ip_data(df)
print(result_df)
