In [1]:
# This notebook generates 50,000 fake phone number nodes and 100,000 relationships between phones to use as data in a Neo4j graph.

In [1]:
import pandas as pd
import random
import datetime

In [3]:
# accumulator for the unique fake phone numbers (filled by the generation loop)
numbers = list()

In [4]:
# Generate 50,000 unique fake phone numbers: the fixed prefix '807' followed by
# eight random digits, appended to `numbers` in generation order.
# Uniqueness is checked against a set, because testing `in` on the growing
# list rescans it on every draw and makes the whole loop quadratic.
seen_numbers = set(numbers)
while len(numbers) < 50000:
    # 807 is a country calling code that is not in use
    fake_num = '807' + ''.join(str(random.randint(0, 9)) for _ in range(8))
    if fake_num not in seen_numbers:
        seen_numbers.add(fake_num)
        numbers.append(fake_num)

In [24]:
# Column data for the phone-node table: one row per generated number, each
# tagged with the node type 'phone'. The type column is sized from `numbers`
# (not a hard-coded count) so both columns always have the same length.
phones_dict = {'identifier': numbers, 'type': ['phone'] * len(numbers)}

In [25]:
# build the phone-node table; the dict keys become the column names
phones_df = pd.DataFrame(data=phones_dict)

In [26]:
# preview the first five phone nodes
phones_df.head(n=5)

Unnamed: 0,identifier,type
0,80761012916,phone
1,80799255509,phone
2,80799433482,phone
3,80788009910,phone
4,80765948263,phone


In [27]:
# Save the phone nodes to the working directory. With to_csv's default
# index=True the row index is written as the first, unnamed column.
phones_df.to_csv(path_or_buf='phone_nodes.csv')

In [30]:
# accumulator for the (source, target) pairs of phone communications
comms = list()

In [31]:
# generate random sources and targets for the phone communications
# The identifier column and its last valid position do not change while the
# loop runs, so they are looked up once instead of on every iteration.
identifiers = phones_df['identifier']
last_position = len(identifiers) - 1
while len(comms) < 100000:
    a = identifiers[random.randint(0, last_position)]
    b = identifiers[random.randint(0, last_position)]
    # a phone cannot communicate with itself: discard the draw and try again
    if a != b:
        comms.append((a, b))

In [49]:
# one row per communication; each (source, target) tuple fills the two columns
comms_df = pd.DataFrame(data=comms, columns=["source", "target"])

In [50]:
# every relationship's source is a phone: the scalar is broadcast to all rows
comms_df["source_type"] = "phone"

In [51]:
# every relationship's target is a phone: the scalar is broadcast to all rows
comms_df["target_type"] = "phone"

In [57]:
# accumulator for the (first_seen, last_seen) date-string pairs
dates = list()

In [58]:
# generate random dates for first_seen and last_seen
# One ('YYYY-MM-DD', 'YYYY-MM-DD') pair is appended to `dates` per row of
# comms_df, with last_seen strictly after first_seen.
# Dates are drawn as day ordinals rather than as separate year/month/day
# parts: sampling the parts independently forced last_seen's month to be no
# earlier, and its day-of-month to be later, than first_seen's even when
# last_seen fell in a later year.
# The overall window is unchanged: first_seen in 2017-01-01..2024-12-27,
# last_seen no later than 2024-12-28.
earliest_ordinal = datetime.date(2017, 1, 1).toordinal()
latest_ordinal = datetime.date(2024, 12, 28).toordinal()
for _ in range(len(comms_df)):
    # leave at least one day after first_seen for last_seen
    first_ordinal = random.randint(earliest_ordinal, latest_ordinal - 1)
    last_ordinal = random.randint(first_ordinal + 1, latest_ordinal)
    first_seen = datetime.date.fromordinal(first_ordinal).strftime('%Y-%m-%d')
    last_seen = datetime.date.fromordinal(last_ordinal).strftime('%Y-%m-%d')
    dates.append((first_seen, last_seen))

In [62]:
# first element of every (first_seen, last_seen) pair, in row order
comms_df['first_seen'] = [first for first, _last in dates]

In [63]:
# second element of every (first_seen, last_seen) pair, in row order
comms_df['last_seen'] = [last for _first, last in dates]

In [69]:
# generate random number of communications
# one independent draw between 2 and 500 (inclusive) for every relationship row
comms_df['times'] = [random.randint(2, 500) for _ in comms_df.index]

In [70]:
# label every relationship with the same collection tag 'A1'
comms_df['collection'] = 'A1'

In [71]:
# preview the first five relationships
comms_df.head(n=5)

Unnamed: 0,source,target,source_type,target_type,first_seen,last_seen,times,collection
0,80721253938,80750554110,phone,phone,2018-10-24,2019-12-26,290,A1
1,80743835363,80744600663,phone,phone,2024-09-23,2024-10-27,199,A1
2,80781538605,80721673418,phone,phone,2022-06-05,2023-09-12,422,A1
3,80723004597,80725581062,phone,phone,2019-08-13,2019-09-18,272,A1
4,80725554752,80763351626,phone,phone,2023-03-11,2024-04-16,249,A1


In [72]:
# Save the relationships to the working directory. With to_csv's default
# index=True the row index is written as the first, unnamed column.
# NOTE(review): a later cell reads this file back from
# "data//phone_relationships_v1.csv" — presumably it is moved into data/ by
# hand between the two steps; confirm.
comms_df.to_csv(path_or_buf="phone_relationships_v1.csv")

In [4]:
# Reload the saved relationships. Column 0 of the file is used as the row
# index rather than read as a data column.
# NOTE(review): the earlier to_csv call writes "phone_relationships_v1.csv" to
# the working directory, not to data/ — confirm the file is expected here.
comms_df = pd.read_csv(filepath_or_buffer="data//phone_relationships_v1.csv", index_col=0)

In [22]:
# Take the first five relationships as the rows to update. The explicit
# .copy() makes the subset an independent frame, so assigning columns on it
# can never write through to (or raise chained-assignment warnings about)
# comms_df. Wrapping a slice in pd.DataFrame(...) does not guarantee a copy.
new_comms_df = comms_df.head(5).copy()

In [23]:
# updating comms with new comms
# push every last_seen date forward by one day, keeping the 'YYYY-MM-DD' format
one_day = datetime.timedelta(days=1)
new_last_seen = [
    (datetime.datetime.strptime(seen, "%Y-%m-%d") + one_day).strftime("%Y-%m-%d")
    for seen in new_comms_df['last_seen']
]

In [24]:
# replace the last_seen column with the dates shifted forward by one day
new_comms_df['last_seen'] = new_last_seen

In [25]:
# each updated relationship has been seen one more time
new_times = [count + 1 for count in new_comms_df['times']]

In [26]:
# replace the times column with the incremented counts
new_comms_df['times'] = new_times

In [28]:
# Save the updated relationships to the working directory. With to_csv's
# default index=True the row index is written as the first, unnamed column.
new_comms_df.to_csv(path_or_buf="new_comms_update.csv")