-
Notifications
You must be signed in to change notification settings - Fork 2
/
fda_reactions_cleaning.py
117 lines (92 loc) · 4.12 KB
/
fda_reactions_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
##### THIS FILE IS FOR DATA CLEANING #####
### IMPORTS ###
import csv
import pprint
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient
### SETTING UP ENVIRONMENT ###
# make sure mongod is running
client = MongoClient()
labels = client.drugs.drug_labeling
events = client.drugs.adverse_events6
enforcement = client.drugs.enforcement
### STEP 1: FLATTEN THE DATA ###
""" the data is nested JSON and I'd like to get it tabular so I'm going
to flatten it with the function I've created below. This one is extracting
the data related to the patient and the drugs they have in their system. """
def flatten_reactions_data():
    """Flatten nested FDA adverse-event JSON from MongoDB into CSV files.

    Reads the ``adverse_events6`` collection in 500-document slices
    (batches 17-29, i.e. documents 8500-14999) because the collection is
    too large to load into memory at once.  Each stored document wraps up
    to 100 event records in its ``results`` list; every (drug, reaction)
    pair of every record becomes one CSV row.  One file is written per
    batch: ``reactions_<batch>.csv``.

    Side effects: queries the module-level ``events`` collection and
    writes CSV files to the working directory.  Returns None.
    """
    print("here we go.............")
    for batch in range(17, 30):
        # Slice the cursor to page through the collection 500 docs at a time.
        start = batch * 500
        cursor_events = events.find({})[start:start + 500]

        # Unwrap the 'results' list of each mongo document into one flat
        # list of event records (up to 100 records per document).
        documents_events = [doc['results'] for doc in cursor_events]
        print("done with documents step: " + str(batch))

        data_events = []
        for results in documents_events:
            data_events.extend(results)
        print("done with data step: " + str(batch))
        print(len(data_events))

        rows = _extract_rows(data_events)
        print(len(rows))

        _write_batch_csv(rows, batch)
        print("done with csv step: " + str(batch))


def _extract_rows(data_events):
    """Build the list-of-lists CSV payload (header row first) from events.

    For every drug entry that has a non-empty 'openfda' section plus both
    'medicinalproduct' and 'drugcharacterization', emit one row per
    reaction on the event.  Rows missing an expected key are skipped with
    a message rather than aborting the batch.
    """
    headers = ['safetyreportid', 'receivedate', 'serious',
               'medicinalproduct', 'drugcharacterization',
               'reaction', 'spl_id']
    # Keys read straight off the event record vs. off each drug entry.
    event_keys = ['receivedate', 'serious']
    drug_keys = ['medicinalproduct', 'drugcharacterization']

    rows = [headers]
    for event in data_events:
        for drug in event['patient']['drug']:
            openfda = drug.get('openfda')
            if not openfda:
                # Drug has no (or an empty) openfda section - nothing to join.
                continue
            if not all(key in drug for key in drug_keys):
                continue
            for reaction in event['patient']['reaction']:
                try:
                    row = [event['safetyreportid']]
                    row.extend(event[key] for key in event_keys)
                    row.extend(drug[key] for key in drug_keys)
                    # BUG FIX: the original iterated the reaction dict's
                    # keys and appended 'reactionmeddrapt' once per key,
                    # duplicating the value; append it exactly once.
                    row.append(reaction['reactionmeddrapt'])
                    # spl_id is a list in openfda, so extend (not append)
                    # preserves the original row shape.
                    row.extend(openfda['spl_id'])
                    rows.append(row)
                except KeyError:
                    print("bad keys")
    return rows


def _write_batch_csv(rows, batch):
    """Write one batch of flattened rows to reactions_<batch>.csv."""
    # newline="" is the csv-module convention for text-mode files.
    with open("reactions" + "_" + str(batch) + ".csv", "w", newline="") as f:
        writer = csv.writer(f)
        try:
            writer.writerows(rows)
        except csv.Error:
            print("data not good...")
flatten_reactions_data()