-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
246 lines (204 loc) · 7.33 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Load and manage the data to be used in the project
Created on 08/18/2015
'''
__author__='ucaiado'
import pandas as pd
import numpy as np
import csv
import zipfile
import sys
# Module-level defaults consumed by loadPISA.__init__
# L_FIELDS: default list of PISA column names kept from each CSV row
L_FIELDS = ["CNT","ESCS","STIDSTD","PV1MATH","PV1READ","PV1SCIE","BELONG",
"OUTHOURS", "ST28Q01", "ST57Q01"]
# S_GDP: default path to the tab-separated per-capita GDP file
S_GDP = "data/OCDE_GDP_percapta.csv"
# S_CONT: default path to the tab-separated continents file
S_CONT = "data/continents.csv"
'''
Beginning of helper functions
'''
def _getFields(l_fields, row):
'''
Return a dictionary with the data into the row, filtering
the fields passed
'''
return {k: row.get(k, None) for k in l_fields}
def rename_contries(l_names):
    """
    Normalize a list of country names and return a new list, in the same
    order and with the same length as the input.
    -------------------------
    l_names: list of strings with the country names to be normalized

    Each name has its surrounding whitespace removed and, when the cleaned
    name is one of the aliases known by this function, is replaced by the
    short name pre-selected for it. Any other name is returned just
    stripped.
    """
    # alias -> short name. Several aliases intentionally share one target
    # (e.g. the US states and both spellings of Macao/Hong Kong).
    d_alias = {
        "United Arab Emirates": "UAE",
        "United Kingdom": "UK",
        "Korea": "South Korea",
        "Chinese Taipei": "Taiwan",
        "Slovak Republic": "Slovakia",
        "Russian Federation": "Russia",
        "Perm(Russian Federation)": "Russia",
        "Hong Kong-China": "Hong Kong",
        "China-Shanghai": "China",
        "Connecticut (USA)": "USA",
        "Florida (USA)": "USA",
        "Massachusetts (USA)": "USA",
        "United States of America": "USA",
        "Viet Nam": "Vietnam",
        "Macao - China": "Macau",
        "Hong Kong - China": "Hong Kong",
        "Shanghai - China": "China",
        "Norway1": "Norway",
        "United States": "USA",
        "Macao-China": "Macau",
    }
    l_rtn = []
    for s_name in l_names:
        # strip BEFORE the lookup: the previous version keyed its table on
        # stripped names but looked up the raw ones, raising KeyError for
        # any name carrying leading/trailing whitespace
        s_clean = s_name.strip()
        l_rtn.append(d_alias.get(s_clean, s_clean))
    return l_rtn
'''
End of helper functions
'''
class loadPISA:
    '''
    Load and handle the dataset used in the project.

    After construction the instance holds:
    df: the PISA dataframe, enriched with GDP, continent and bucket columns
        and reduced to the rows/columns used in the study
    df_gdp: the GDP dataframe restricted to the "PISA 2012" rows
    df_continents: the continents dataframe indexed by country name
    df_avgByCountry: per-country means of df plus the continent column
    '''

    def __init__(self, s_fname, l_fields=L_FIELDS, s_gdp=S_GDP,
                 s_cont=S_CONT):
        '''
        Initialize a loadPISA instance. Load every file and run the whole
        preparation pipeline, saving the results as attributes
        -------------------------
        s_fname: path to the zipped PISA file
        l_fields: list of strings with the fields in PISA file desired
        s_gdp: the path to GDP file
        s_cont: the path to the continents file
        '''
        self.s_file = s_fname
        # the order below matters: each step reads the self.df produced by
        # the previous one
        self.df = self._loadData(l_fields)
        self.df = self.correctTypes()
        self.df_gdp = self._loadGDP(s_gdp)
        self.df, self.df_gdp = self._imputGDPinPisa(self.df, self.df_gdp)
        self.df_continents = self._loadContinents(s_cont)
        self.df = self.setCountriesContinents()
        self.df = self.bucketizePisa()
        self.df = self.reduceData()
        self.df_avgByCountry = self._createGroupedData()

    def getContinents(self):
        '''
        Return a copy of Continents dataframe
        '''
        return self.df_continents.copy()

    def getPisa(self):
        '''
        Return a copy of Pisa dataframe
        '''
        return self.df.copy()

    def getGDP(self):
        '''
        Return a copy of GDP dataframe
        '''
        return self.df_gdp.copy()

    def setCountriesContinents(self):
        '''
        Return a copy of the pisa dataframe with a new "continent" column,
        looked up in the continents dataframe by the CNT1 value of each row
        '''
        df = self.getPisa()
        # NOTE(review): .values of the lookup is 2-D; this presumably relies
        # on the continents file having a single column besides CNT1 -
        # confirm against the data file
        df['continent'] = self.df_continents.loc[df["CNT1"].values].values
        return df

    def correctTypes(self):
        '''
        Return a copy of the pisa dataframe with PV1MATH, ESCS and ST57Q01
        converted to float. The string "NA" in ESCS and ST57Q01 is turned
        into a missing value before the conversion
        '''
        df = self.getPisa()
        df["PV1MATH"] = df["PV1MATH"].astype(float)
        for s_col in ("ESCS", "ST57Q01"):
            # single .loc assignment instead of chained indexing
            # (df.COL[mask] = None), which may write to a temporary copy
            df.loc[df[s_col] == "NA", s_col] = None
            df[s_col] = df[s_col].astype(float)
        return df

    def reduceData(self):
        '''
        Exclude all rows and columns that will be not used in the study.
        Return a copy of the pisa dataframe without the unused columns and
        without any row that still holds a missing value
        '''
        df2 = self.getPisa()
        # exclude columns
        l_unused = ["BELONG", "OUTHOURS", "PV1READ", "PV1SCIE", "ST28Q01",
                    "CNT", "STIDSTD"]
        df2.drop(l_unused, axis=1, inplace=True)
        # exclude rows with at least one missing value
        df2.dropna(axis=0, how="any", inplace=True)
        return df2

    def bucketizePisa(self):
        '''
        Create buckets to the ESCS and ST57Q01 columns of the pisa dataframe
        and return a new dataframe with these data included as the columns
        ESCS_bk2 and ST57Q01_bk
        '''
        df = self.getPisa()
        # bin limits for ESCS: 7 equal-width bins over the non-null values
        _, a_bins = np.histogram(df["ESCS"].dropna(), bins=7)
        # NOTE(review): with the pandas.cut defaults (right=True,
        # include_lowest=False) a value equal to the lowest edge gets no
        # bucket, i.e. the minimum ESCS and ST57Q01 == 0. Those rows appear
        # to be removed later by reduceData - confirm this is intended
        # bucketize ESCS (social status)
        df["ESCS_bk2"] = pd.cut(df["ESCS"], bins=a_bins)
        # bucketize ST57Q01 (time spent)
        l_bins = [0, 2, 4, 5, 7, 8, 10, 12, 15, 30]
        df["ST57Q01_bk"] = pd.cut(df["ST57Q01"], bins=l_bins)
        return df

    def _createGroupedData(self):
        '''
        Group the pisa data by country (CNT1), take the mean of each group
        and return the result with the continent of each country included
        '''
        df = self.getPisa()
        df_continents = self.getContinents()
        df_grouped = df.groupby("CNT1").mean()
        # insert the continent of each country in the dataframe
        df_grouped["continent"] = df_continents.loc[df_grouped.index].values
        return df_grouped

    def _loadData(self, l_fields):
        '''
        Return a dataframe of the PISA data desired, read from the first
        member of the zip archive stored in self.s_file
        -------------------------
        l_fields: list of strings with the fields to keep from each row
        '''
        l_rtn = []
        # defined up front so the final progress message does not raise a
        # NameError when the CSV has no data rows
        idx = 0
        with zipfile.ZipFile(self.s_file) as zfile:
            with zfile.open(zfile.namelist()[0]) as f:
                # NOTE(review): the member is opened as a byte stream, which
                # csv.DictReader accepts on Python 2 only
                reader = csv.DictReader(f)
                for idx, row in enumerate(reader):
                    # report progress every 50000 lines; the call form of
                    # print yields the same output on Python 2 and 3
                    if idx % 50000 == 0:
                        print("estou na linha {}".format(idx))
                    # get data desired
                    l_rtn.append(_getFields(l_fields, row))
        # report the index of the last line processed
        print("estou na linha {}".format(idx))
        return pd.DataFrame(l_rtn)

    def _loadGDP(self, s_gdp):
        '''
        Return a dataframe with the content of the tab-separated GDP file,
        keeping only the rows whose Year is "PISA 2012"
        -------------------------
        s_gdp: the path to GDP file
        '''
        df_gdp = pd.read_csv(s_gdp, sep="\t")
        df_gdp = df_gdp.loc[df_gdp["Year"] == "PISA 2012"]
        return df_gdp

    def _loadContinents(self, s_continents):
        '''
        Return a dataframe with the Continent dataset, indexed by the values
        of its CNT1 column (the column itself is dropped)
        -------------------------
        s_continents: the path to the tab-separated continents file
        '''
        # read the path received; it used to be ignored in favor of a
        # hard-coded "data/continents.csv"
        df_continents = pd.read_csv(s_continents, sep="\t")
        df_continents.index = df_continents.CNT1.values
        df_continents.drop(["CNT1"], axis=1, inplace=True)
        return df_continents

    def _imputGDPinPisa(self, df_pisa, df_gdp):
        '''
        Return copies of the dataframes passed with new columns included.
        Insert in df_pisa the normalized country names (CNT1) and the GDP
        information (perCaptaGDP). Insert the normalized names (country1) in
        df_gdp and use them as its index
        -------------------------
        df_pisa: pisa dataframe, with a CNT column
        df_gdp: GDP dataframe, with country and "Per capita GDP" columns
        '''
        # work on copies of the arguments; they used to be discarded and
        # replaced by the instance attributes
        df_pisa = df_pisa.copy()
        df_gdp = df_gdp.copy()
        # correct names in df_pisa
        df_pisa["CNT1"] = rename_contries(list(df_pisa["CNT"]))
        # correct names in df_gdp
        df_gdp["country1"] = rename_contries(list(df_gdp["country"]))
        df_gdp.index = list(df_gdp["country1"])
        # insert GDP infos in PISA, one value per pisa row
        l_aux = list(df_gdp.loc[df_pisa["CNT1"]]["Per capita GDP"].values)
        df_pisa["perCaptaGDP"] = l_aux
        return df_pisa, df_gdp