In [24]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import networkx as nx
import itertools
import csv

In [54]:
def prepare_csv_node_list(data, threshold=0):
    """Build the Gephi node table for drugs and conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``drugName``, ``drugID``, ``condition`` and
        ``conditionID``.
    threshold : int, optional
        Only drugs that appear in strictly more than ``threshold`` rows are
        emitted. The default of 0 keeps every drug. Conditions are never
        filtered.

    Returns
    -------
    list of tuple
        The header ``("ID", "label", "type")`` followed by one
        ``(id, label, type)`` tuple per drug (type 1) and then per condition
        (type 2). Within each group the order is that of ``value_counts()``.
    """
    # type 1 is drugs, type 2 is conditions, numbered for gephi
    print("Generating node list:")
    drugs = data['drugName'].value_counts()
    # Fix: the filtered series used to be bound to a throwaway name that the
    # loop variable then overwrote, so the threshold was silently ignored.
    drugs = drugs.loc[lambda x: x > threshold]
    conditions = data['condition'].value_counts()

    # ID of the first row seen for each label, computed once instead of
    # re-scanning the whole frame for every label.
    first_drug_rows = data.drop_duplicates('drugName')
    drug_ids = dict(zip(first_drug_rows['drugName'], first_drug_rows['drugID']))
    first_condition_rows = data.drop_duplicates('condition')
    condition_ids = dict(zip(first_condition_rows['condition'],
                             first_condition_rows['conditionID']))

    node_list = [("ID", "label", "type")]
    print("enumerating drugs")
    for drug in drugs.index:
        node_list.append((drug_ids[drug], drug, 1))
    print("enumerating conditions")
    for condition in conditions.index:
        node_list.append((condition_ids[condition], condition, 2))
    print("DONE")
    return node_list
    
# def long_prepare_csv_edge_list(data, threshold=0):   
#     prog_bar_counter = 0
#     drug_list = data['drugName'].value_counts()
#     drug_list = drug_list.loc[lambda x: x>threshold]
#     num_of_drugs = drug_list.shape[0]
#     edge_list_repeats = [("Source", "Target", "Type", "Weight")]
#     edge_list_singular = [("Source", "Target", "Type", "Weight")]
#     print("Progress:\n[", end='')
#     for drug, drug_count in drug_list.items():
#         drugID = data.loc[data['drugName'] == drug]["drugID"].values[0]
#         conditionID = data.loc[data['condition'] == condition]["conditionID"].values[0]
#         # all of the conditions per drug
#         conds_series = data.loc[data['drugName'] == drug]['condition'].value_counts()

#         for cond, cond_count in conds_series.items():
#             # filter df per condition to get df with only the same condition and drug in the loop
#             filtered_df = data.loc[(data['drugName'] == drug) & (data['condition'] == cond)]
#             # for the long version, iterate over all instances
#             for index, row in filtered_df.iterrows():
#                 edge_list_repeats.append((row["drugID"], row["conditionID"], "Undirected", row["weight"]))
#             #sum = filtered_df['rating'].sum()
#             #avg = sum / filtered_df.shape[0]
#             # short version with no repeats
#             edge_list_singular.append((drugID, conditionID, "Undirected", 1))
#         if prog_bar_counter % (num_of_drugs/100) == 0:
#             print(".", end='')
#         prog_bar_counter += 1
#     print("]\nDone!")
#     return (edge_list_singular, edge_list_repeats)

def prepare_edge_list(data):
    """Build the Gephi edge table: one undirected edge per (drug, condition) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``drugName``, ``drugID``, ``condition``,
        ``conditionID`` and ``weight``.

    Returns
    -------
    list of tuple
        The header ``("Source", "Target", "Type", "Weight")`` followed by one
        ``(drugID, conditionID, "Undirected", weight)`` tuple per distinct
        drug/condition pair. Drugs appear in order of first occurrence, and
        each drug's conditions in order of first occurrence for that drug.
        IDs are taken from the first row seen for the drug / condition; the
        weight is taken from the first row of that specific pair.

    Notes
    -----
    Prints one progress dot per drug to stdout.
    """
    edges = [("Source", "Target", "Type", "Weight")]
    print("Generating edge list:\n[", end='')

    # First-seen ID per label, looked up once instead of filtering the whole
    # frame again for every drug and every condition.
    first_drug_rows = data.drop_duplicates('drugName')
    drug_ids = dict(zip(first_drug_rows['drugName'], first_drug_rows['drugID']))
    first_condition_rows = data.drop_duplicates('condition')
    condition_ids = dict(zip(first_condition_rows['condition'],
                             first_condition_rows['conditionID']))

    # One row per distinct pair, in original row order.
    pairs = data.drop_duplicates(['drugName', 'condition'])
    # sort=False keeps drugs in order of first appearance; rows whose drug
    # name is missing are not part of any group and are therefore skipped.
    for drug, pair_rows in pairs.groupby('drugName', sort=False):
        drug_id = int(drug_ids[drug])
        for condition, weight in zip(pair_rows['condition'], pair_rows['weight']):
            # Missing conditions (NaN is truthy, so a plain truth test lets it
            # through) and empty labels produce no edge.
            if pd.isna(condition) or not condition:
                continue
            # Fix: the weight now belongs to this drug/condition pair rather
            # than to the first row of the condition regardless of drug.
            edges.append((drug_id, int(condition_ids[condition]), "Undirected", weight))
        print(".", end='')
    print("]\nDone!")
    return edges
        
        
def add_id_to_df(data, condition_offset=10000):
    """Add integer ``drugID`` and ``conditionID`` columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``drugName`` and ``condition``. It is
        modified in place; nothing is returned.
    condition_offset : int, optional
        Value added to every condition group number so that condition IDs do
        not overlap drug IDs. The two ranges only stay disjoint while the
        number of distinct drugs is at most ``condition_offset``; raise it
        for larger datasets.
    """
    # Each distinct drug name gets its own group number, starting at 0.
    data['drugID'] = data.groupby(['drugName']).ngroup()
    # data['conditionID'] += data.loc[data['drugID'].idxmax()]['drugID']
    # Fix: the integer cast is applied to 'conditionID' itself; it used to be
    # written to a misspelled 'concitionID' column, leaving a stray column.
    data['conditionID'] = (
        data.groupby(['condition']).ngroup() + condition_offset
    ).astype(int)
def add_weight_to_df(data, weight=1):
    """Add (or overwrite) the ``weight`` column of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; nothing is returned.
    weight : scalar or array-like, optional
        Value assigned to the column. A scalar is applied to every row; an
        array-like must have one entry per row. Defaults to 1, i.e. every row
        carries the same weight.
    """
    # TODO: add function here for actual weight
    data['weight'] = weight
def clean_df(data):
    """Strip unused columns and unusable rows from ``data`` in place.

    The columns ``'Unnamed: 0'``, ``'date'`` and ``'review'`` are removed,
    then every row whose ``condition`` contains ``"<"`` or ``"Not Listed"``,
    then every row with any missing value. Finally the index is renumbered
    from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Redundant columns.
    for unused_column in ('Unnamed: 0', 'date', 'review'):
        data.drop(unused_column, axis=1, inplace=True)

    # Corrupted data: rows whose condition text carries one of these markers.
    for marker in ("<", "Not Listed"):
        has_marker = data['condition'].str.contains(marker, na=False)
        data.drop(data[has_marker].index, inplace=True)

    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data
    

In [52]:
# Load the raw reviews: every line of train.jsonl is parsed as one JSON
# record and becomes one row of the frame.
with open('train.jsonl') as f:
    df_raw = pd.DataFrame(json.loads(line) for line in f)
# Row count is captured before cleaning because clean_df modifies its argument
# in place and returns that same object (df and df_raw are one frame here).
raw_size = df_raw.shape[0]
df = clean_df(df_raw)
print(f"removed {raw_size-df.shape[0]} rows")
# Both helpers add their columns to df in place.
add_id_to_df(df)
add_weight_to_df(df)
print("Added ID to drugs and ID to conditions")
# Keep only the columns used downstream, in a fixed order; this rebinds df to
# the column selection.
df = df[['drugID', 'drugName', 'conditionID', 'condition', 'rating', 'usefulCount', 'weight']]
# Last expression of the cell: displays the frame.
df

removed 2238 rows
Added ID to drugs and ID to conditions


Unnamed: 0,drugID,drugName,conditionID,condition,rating,usefulCount,weight
0,3175,Valsartan,10393,Left Ventricular Dysfunction,9.0,27,1
1,1419,Guanfacine,10000,ADHD,8.0,192,1
2,1839,Lybrel,10092,Birth Control,5.0,17,1
3,2263,Ortho Evra,10092,Birth Control,8.0,10,1
4,512,Buprenorphine / naloxone,10500,Opiate Dependence,9.0,37,1
...,...,...,...,...,...,...,...
159054,551,Campral,10019,Alcohol Dependence,10.0,125,1
159055,1958,Metoclopramide,10461,Nausea/Vomiting,1.0,34,1
159056,2257,Orencia,10627,Rheumatoid Arthritis,2.0,35,1
159057,3002,Thyroid desiccated,10738,Underactive Thyroid,10.0,79,1


In [53]:
# Persist the cleaned frame as UTF-8 CSV; index=False leaves the row index out of the file.
df.to_csv("data_clean.csv", encoding='utf-8', index=False)

In [None]:
# testing
# Scratch cell for checking the ID lookups by hand. Note that it binds the
# notebook-level names `data`, `condition` and `drug`.
data = df
condition = "ADHD"
drug = "Aripiprazole"
# This expression is not the last statement of the cell, so its value is
# computed but not displayed.
data.loc[data['condition'] == condition]["conditionID"].values[0]
# Distinct conditions recorded for the chosen drug.
test_lst = list(data.loc[data['drugName'] == drug]['condition'].unique())
for test in test_lst:
    if test:
        # conditionID of the first row that carries this condition.
        print(data.loc[data['condition'] == test]["conditionID"].values[0])
# Ad-hoc filters for suspicious condition labels, kept for reference:
#df[df['condition'].str.contains("Listed", na=False)]
#df[df['condition'].str.contains("None", na=False)]

In [55]:
# Build the Gephi node and edge tables from the cleaned frame; each is a list
# of tuples whose first entry is the header row. The node list is built with
# the default threshold of 0.
node_list = prepare_csv_node_list(df)
edge_list = prepare_edge_list(df)

Generating node list:
enumerating drugs
enumerating conditions
DONE
Generating edge list:
[.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [58]:
# Preview the header plus the first rows of each list. Slicing, unlike
# indexing with range(10), does not raise IndexError when a list holds fewer
# than 10 entries.
for node_row in node_list[:10]:
    print(node_row)
print("-"*50)
for edge_row in edge_list[:10]:
    print(edge_row)

('ID', 'label', 'type')
(1734, 'Levonorgestrel', 1)
(1180, 'Etonogestrel', 1)
(1175, 'Ethinyl estradiol / norethindrone', 1)
(2132, 'Nexplanon', 1)
(1176, 'Ethinyl estradiol / norgestimate', 1)
(1173, 'Ethinyl estradiol / levonorgestrel', 1)
(2396, 'Phentermine', 1)
(2780, 'Sertraline', 1)
(1141, 'Escitalopram', 1)
--------------------------------------------------
('Source', 'Target', 'Type', 'Weight')
(3175, 10393, 'Undirected', 1)
(3175, 10313, 'Undirected', 1)
(1419, 10000, 'Undirected', 1)
(1419, 10313, 'Undirected', 1)
(1419, 10722, 'Undirected', 1)
(1839, 10092, 'Undirected', 1)
(1839, 10234, 'Undirected', 1)
(2263, 10092, 'Undirected', 1)
(512, 10500, 'Undirected', 1)


In [60]:
# Export both tables for Gephi, node list first and then edge list.
# newline='' leaves line-ending handling to the csv module.
with open('gephi_node_list.csv', mode='w', encoding='UTF8', newline='') as nodes_file:
    csv.writer(nodes_file).writerows(node_list)
with open('gephi_edge_list.csv', mode='w', encoding='UTF8', newline='') as edges_file:
    csv.writer(edges_file).writerows(edge_list)