In [136]:
import hashlib
import json
import os
import sqlite3
import uuid

import pandas as pd
import requests
import seaborn as sns

# Connect to the DB and load every row of the peers table into a DataFrame
query = "SELECT * FROM peers;"

conn = sqlite3.connect('../test.db')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # The frame is fully materialised in memory at this point, so the SQLite
    # handle can be released instead of lingering for the kernel's lifetime.
    conn.close()

In [50]:
# Total unique IPs vs. total peer rows
print(f"Total Unique IPs: {len(df.address.unique())} Total IPs: {len(df.address)}")

# Getting a df of unique IP+torrent combos only
## Hash the IP + torrent name concatenated and keep it as its own column
## Select only rows with a unique (address, torrent_name) pair

# Missing values are replaced with "" before concatenating: NaN + str yields
# NaN, and NaN has no .encode(), which would abort the whole cell.
# NOTE(review): assumes both columns hold strings — confirm against the peers schema.
ip_tor_key = df["address"].fillna("") + df["torrent_name"].fillna("")
df["ip_tor_hash"] = ip_tor_key.map(lambda key: hashlib.md5(key.encode()).hexdigest())

# De-duplicate on the real column pair rather than on the hash: the hash input
# has no separator, so two different pairs can concatenate to the same string
# (e.g. "10.0.0.1" + "23x" and "10.0.0.12" + "3x") and one would be dropped.
u_df = df.drop_duplicates(subset=["address", "torrent_name"])  # Unique df

Total Unique IPs: 114 Total IPs: 1044


In [63]:
# Get IP address information from ipinfo.io
## The API token is read from the IPINFO_TOKEN environment variable so the
## credential never lives in the notebook source or its saved outputs.
## NOTE(review): the token that used to be embedded in this cell has been
## exposed and should be revoked/rotated.
## Get the responses into a larger object that we can iterate through after the queries

ipinfo_token = os.environ.get("IPINFO_TOKEN")
if not ipinfo_token:
    raise RuntimeError("Set the IPINFO_TOKEN environment variable before running this cell")

ips = {}
failed_ips = []  # addresses whose lookup failed, kept so they can be retried

# u_df holds one row per (IP, torrent) pair, so an address can appear several
# times; query each distinct, non-null address exactly once.
with requests.Session() as session:
    for ip in u_df.address.dropna().unique():
        try:
            r = session.get(
                f"https://ipinfo.io/{ip}",
                params={"token": ipinfo_token},
                timeout=10,  # never let one stalled request hang the notebook
            )
            r.raise_for_status()
            ips[ip] = r.json()
        except (requests.RequestException, ValueError) as exc:
            # Only the exception type is printed: the full message can include
            # the request URL, which carries the token as a query parameter.
            failed_ips.append(ip)
            print(f"Lookup failed for {ip}: {type(exc).__name__}")

In [68]:
# Persist the raw lookups under a fresh, uniquely named JSON file so the
# results are not lost and the API calls do not have to be repeated
dump_path = f"ips_{uuid.uuid4()}.json"
with open(dump_path, "w") as out_file:
    json.dump(ips, out_file)
print(ips)


In [118]:
# Parser for the IP entries: returns a dictionary that the
# calling code can then insert into the df
def ip_parser(ip):
    """
    Normalise one entry of the ips dictionary into a fixed-shape record.

    ip is the mapping stored for a single address, for example:
        {"ip": "ip.addr.is.here", "country": "..", ...}

    Nine fields are tracked (ip, hostname, city, region, country, loc, org,
    postal, timezone). A tracked field absent from the input defaults to "";
    any other key in the input is ignored.

    Returns a one-entry dict keyed by the input's "ip" value ("" when absent):
        {"ip.addr.is.here": {"address": "ip.addr.is.here", "hostname": ..., ...}}
    The input's "ip" field is exposed under the name "address".
    """
    tracked_fields = (
        "ip", "hostname", "city", "region", "country",
        "loc", "org", "postal", "timezone",
    )

    # Same field order as the tracked list, with "ip" renamed to "address".
    record = {
        ("address" if field == "ip" else field): ip.get(field, "")
        for field in tracked_fields
    }

    return {record["address"]: record}


In [119]:
# Enter default empty strings for missing values for data clarity, then convert
# the cleaned records into their own dataframe for insertion into the main DF
clean_ip_list = {}
for ip, raw_entry in ips.items():
    # ip_parser keys its result by the response's own "ip" field. A payload
    # without that field (or with a different value) would not be found under
    # `ip`, so take the single record it returns instead of indexing by `ip`.
    (record,) = ip_parser(raw_entry).values()
    # The queried address is the join key used against u_df, so make sure the
    # record carries it even when the response did not echo it back.
    record["address"] = ip
    clean_ip_list[ip] = record

# One row per IP, one column per field
ipdf = pd.DataFrame.from_dict(clean_ip_list, orient="index")

In [123]:
# Attach the ipinfo columns to the unique peer rows, matching on "address"
mu_df = u_df.merge(ipdf, on="address")

In [145]:
#mu_df.sample(8)

In [146]:
mu_df["country"].value_counts()

US    30
DE    14
RU     9
NL     7
FR     6
GB     6
SE     4
JP     3
CA     3
ES     3
FI     3
UA     2
NO     2
HU     2
AT     2
CZ     2
AR     2
BR     2
AU     2
CH     2
IT     1
SK     1
CN     1
KR     1
VN     1
IN     1
LT     1
LV     1
AZ     1
Name: country, dtype: int64