In [23]:
import pandas as pd
import utils
import matplotlib.pyplot as plt
import seaborn as sns

## DATASET

In [24]:
# Load the economic and demographic ZCTA tables from the CSV files under ../data.
Economic = pd.read_csv("../data/EconomicData_ZCTAs.csv")
Demographic = pd.read_csv("../data/DemographicData_ZCTAs.csv")

## CLEANING

In [25]:
# Raw Economic header -> short label. The identifier columns get the names
# used as merge keys further down, and each household-income bracket is
# shortened to a compact "k" range.
rename_col_Economic = {
    # identifier columns
    'id': 'Id',
    'Geographic Area Name': 'GeographicAreaName',
    # household income brackets
    'TotalHouseholds_LessThan$10.000': 'Less than 10k',
    'TotalHouseholds_$10.000to$14.999': '10 to 14.9k',
    'TotalHouseholds_$15.000to$24.999': '15 to 24.9k',
    'TotalHouseholds_$25.000to$34.999': '25 to 34.9k',
    'TotalHouseholds_$35.000to$49.999': '35 to 49.9k',
    'TotalHouseholds_$50.000to$74.999': '50 to 74.9k',
    'TotalHouseholds_$75.000to$99.999': '75 to 99.9k',
    'TotalHouseholds_$100.000to$149.999': '100 to 149.9k',
    'TotalHouseholds_$150.000to$199.999': '150 to 199.9k',
    'TotalHouseholds_$200.000OrMore': '200k or more',
}

# Headers that are absent from the frame are skipped by rename (its default).
Economic = Economic.rename(rename_col_Economic, axis="columns")

In [26]:
# Raw Demographic header -> short age-bracket label.
rename_col_demographic = {
    'Population_Under5Years': 'Under 5',
    'Population_5to9Years': '5 to 9',
    'Population_10to14Years': '10 to 14',
    'Population_15to19Years': '15 to 19',
    'Population_20to24Years': '20 to 24',
    'Population_25to34Years': '25 to 34',
    'Population_35to44Years': '35 to 44',
    'Population_45to54Years': '45 to 54',
    'Population_55to59Years': '55 to 59',
    'Population_60to64Years': '60 to 64',
    'Population_65to74Years': '65 to 74',
    'Population_75to84Years': '75 to 84',
    'Population_85YearsAndOver': '85 and Over',
}

# Only the age-bracket columns are renamed; every other column keeps its name.
Demographic = Demographic.rename(rename_col_demographic, axis="columns")

### ECONOMIC

In [27]:
# 'Unnamed: 0' is the name pandas assigns to a header-less first CSV column
# (presumably a row index written by an earlier export — TODO confirm).
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
Economic = Economic.drop(columns=['Unnamed: 0'], errors="ignore")
Demographic = Demographic.drop(columns=['Unnamed: 0'], errors="ignore")

In [28]:
# For this study, repeated rows are treated as accidental duplicates rather
# than as multiple legitimate entries, so only the first occurrence is kept.
Economic = Economic.drop_duplicates(keep="first")

In [29]:
# Summarize the cleaned Economic frame: row count, column names, non-null counts and dtypes.
Economic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33120 entries, 0 to 33119
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Id                  33120 non-null  object
 1   GeographicAreaName  33120 non-null  object
 2   Less than 10k       33120 non-null  int64 
 3   10 to 14.9k         33120 non-null  int64 
 4   15 to 24.9k         33120 non-null  int64 
 5   25 to 34.9k         33120 non-null  int64 
 6   35 to 49.9k         33120 non-null  int64 
 7   50 to 74.9k         33120 non-null  int64 
 8   75 to 99.9k         33120 non-null  int64 
 9   100 to 149.9k       33120 non-null  int64 
 10  150 to 199.9k       33120 non-null  int64 
 11  200k or more        33120 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 3.3+ MB


### DEMOGRAPHIC

In [30]:
# Some ZIP codes carry no usable data. They are detected through the
# 'SexRatio(males per 100 females)' column, because that value can be empty
# even when individual age brackets hold counts above zero.
has_sex_ratio = Demographic['SexRatio(males per 100 females)'].notna()
Demographic = Demographic[has_sex_ratio]

In [31]:
# Summarize the filtered Demographic frame: row count, column names, non-null counts and dtypes.
Demographic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32620 entries, 0 to 33119
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Id                               32620 non-null  object 
 1   GeographicAreaName               32620 non-null  object 
 2   TotalPopulation                  32620 non-null  int64  
 3   SexRatio(males per 100 females)  32620 non-null  float64
 4   Under 5                          32620 non-null  int64  
 5   5 to 9                           32620 non-null  int64  
 6   10 to 14                         32620 non-null  int64  
 7   15 to 19                         32620 non-null  int64  
 8   20 to 24                         32620 non-null  int64  
 9   25 to 34                         32620 non-null  int64  
 10  35 to 44                         32620 non-null  int64  
 11  45 to 54                         32620 non-null  int64  
 12  55 to 59               

In [32]:
# For each table, report the number of rows, the number of distinct
# GeographicAreaName values, and their difference (0 means every ZCTA appears once).
for dataset_label, zcta_names in (
    ("Economic", Economic['GeographicAreaName']),
    ("Demographic", Demographic['GeographicAreaName']),
):
    total_rows = len(zcta_names)
    distinct_rows = len(zcta_names.unique())
    print(f"{dataset_label} Zip Code: {total_rows}, valores únicos: {distinct_rows}. Difereça: {total_rows - distinct_rows}")

Economic Zip Code: 33120, valores únicos: 33120. Difereça: 0
Demographic Zip Code: 32620, valores únicos: 32620. Difereça: 0


### MERGE DATASETS

In [33]:
# Attach the economic columns to every demographic ZCTA. how="left" keeps all
# Demographic rows (already filtered to ZCTAs that have a sex ratio).
# validate="one_to_one" makes pandas raise MergeError if the key pair is
# duplicated on either side, instead of silently multiplying rows.
df = pd.merge(
    Demographic,
    Economic,
    how="left",
    on=['Id', 'GeographicAreaName'],
    validate="one_to_one",
)
df.head()

Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,...,Less than 10k,10 to 14.9k,15 to 24.9k,25 to 34.9k,35 to 49.9k,50 to 74.9k,75 to 99.9k,100 to 149.9k,150 to 199.9k,200k or more
0,8600000US35004,ZCTA5 35004,12045,94.1,805,1075,898,477,578,2088,...,198,71,298,513,647,1117,529,945,245,61
1,8600000US35005,ZCTA5 35005,7344,86.1,504,453,511,499,214,788,...,188,184,318,293,353,562,299,407,67,26
2,8600000US35006,ZCTA5 35006,2883,108.2,96,153,303,129,156,183,...,71,20,117,104,154,176,124,194,51,7
3,8600000US35007,ZCTA5 35007,26332,95.0,1936,1992,1837,1762,1376,3119,...,396,208,670,462,1173,1854,1578,2224,473,254
4,8600000US35010,ZCTA5 35010,20613,90.5,1306,1465,944,1217,1128,2513,...,700,610,1093,957,1056,1512,807,749,254,249


In [34]:
# Inspect the merged frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32620 entries, 0 to 32619
Data columns (total 28 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Id                               32620 non-null  object 
 1   GeographicAreaName               32620 non-null  object 
 2   TotalPopulation                  32620 non-null  int64  
 3   SexRatio(males per 100 females)  32620 non-null  float64
 4   Under 5                          32620 non-null  int64  
 5   5 to 9                           32620 non-null  int64  
 6   10 to 14                         32620 non-null  int64  
 7   15 to 19                         32620 non-null  int64  
 8   20 to 24                         32620 non-null  int64  
 9   25 to 34                         32620 non-null  int64  
 10  35 to 44                         32620 non-null  int64  
 11  45 to 54                         32620 non-null  int64  
 12  55 to 59          

## VERIFICAR OS ZIPCODES DAS CLÍNICAS

In [35]:
# Load the geocoded lab table from ../data and preview its first rows.
labs = pd.read_csv("../data/df_geocode.csv")
labs.head() 

Unnamed: 0.1,Unnamed: 0,Lab Id,Address,Location,Zipcode
0,0,L152,"3800 PLEASANT HILL RD STE 1, DULUTH, GA 30096","34.000319,-84.1629724",30096.0
1,1,L520,"1614 N JAMES ST, ROME, NY 13440","43.2311327,-75.4445363",13440.0
2,2,L141,"12911 120TH AVE NE STE D60, KIRKLAND, WA 98034","47.7162786,-122.1838152",98034.0
3,3,L524,"5667 PEACHTREE DUNWOODY RD 250, ATLANTA, GA 30342","33.9093875,-84.3529096",30342.0
4,4,L545,"1204 IL HWY 164, OQUAWKA, IL 61469","40.9309925,-90.9437598",61469.0


In [36]:
# Discard labs whose Zipcode is missing; the integer cast in the next cell needs non-null values.
labs = labs.dropna(subset=['Zipcode'])

In [37]:
# Store an integer copy of Zipcode in a scratch column named 'teste'.
# NOTE(review): no cell visible here reads 'teste', and Zipcode itself stays
# float64 — confirm whether this cast was meant to replace Zipcode instead.
labs['teste'] = labs['Zipcode'].astype(int)

In [38]:
# Inspect the lab table after dropping rows without a Zipcode.
labs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118 entries, 0 to 118
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  118 non-null    int64  
 1   Lab Id      118 non-null    object 
 2   Address     118 non-null    object 
 3   Location    118 non-null    object 
 4   Zipcode     118 non-null    float64
 5   teste       118 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 6.5+ KB


In [39]:
# Number of distinct lab ZIP codes (dropna=False counts a missing value as its
# own entry, matching what len(unique()) would report).
labs['Zipcode'].nunique(dropna=False)

105

In [40]:
# GeographicAreaName values look like "ZCTA5 35004": the last five characters
# are the ZIP code. The vectorized .str slice replaces a per-row Python lambda.
# The code is stored as a number (leading zeros are dropped) so that it can be
# compared with the numeric Zipcode column of the lab table.
df['Zipcode'] = pd.to_numeric(df['GeographicAreaName'].str[-5:])
df.head()

Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,...,10 to 14.9k,15 to 24.9k,25 to 34.9k,35 to 49.9k,50 to 74.9k,75 to 99.9k,100 to 149.9k,150 to 199.9k,200k or more,Zipcode
0,8600000US35004,ZCTA5 35004,12045,94.1,805,1075,898,477,578,2088,...,71,298,513,647,1117,529,945,245,61,35004
1,8600000US35005,ZCTA5 35005,7344,86.1,504,453,511,499,214,788,...,184,318,293,353,562,299,407,67,26,35005
2,8600000US35006,ZCTA5 35006,2883,108.2,96,153,303,129,156,183,...,20,117,104,154,176,124,194,51,7,35006
3,8600000US35007,ZCTA5 35007,26332,95.0,1936,1992,1837,1762,1376,3119,...,208,670,462,1173,1854,1578,2224,473,254,35007
4,8600000US35010,ZCTA5 35010,20613,90.5,1306,1465,944,1217,1128,2513,...,610,1093,957,1056,1512,807,749,254,249,35010


In [41]:
# Flag every ZCTA whose ZIP code hosts at least one lab. Series.isin performs
# the membership test in a single vectorized call; the previous lambda rebuilt
# labs['Zipcode'].tolist() and scanned it linearly for each row of df.
df['isLab'] = df['Zipcode'].isin(labs['Zipcode'])

In [42]:
# Preview the merged frame with the new isLab flag.
df.head()

Unnamed: 0,Id,GeographicAreaName,TotalPopulation,SexRatio(males per 100 females),Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,...,15 to 24.9k,25 to 34.9k,35 to 49.9k,50 to 74.9k,75 to 99.9k,100 to 149.9k,150 to 199.9k,200k or more,Zipcode,isLab
0,8600000US35004,ZCTA5 35004,12045,94.1,805,1075,898,477,578,2088,...,298,513,647,1117,529,945,245,61,35004,False
1,8600000US35005,ZCTA5 35005,7344,86.1,504,453,511,499,214,788,...,318,293,353,562,299,407,67,26,35005,False
2,8600000US35006,ZCTA5 35006,2883,108.2,96,153,303,129,156,183,...,117,104,154,176,124,194,51,7,35006,False
3,8600000US35007,ZCTA5 35007,26332,95.0,1936,1992,1837,1762,1376,3119,...,670,462,1173,1854,1578,2224,473,254,35007,False
4,8600000US35010,ZCTA5 35010,20613,90.5,1306,1465,944,1217,1128,2513,...,1093,957,1056,1512,807,749,254,249,35010,False


## SALVAR O DATASET PROCESSADO

In [43]:
# Summary statistics of the numeric columns before saving.
# NOTE(review): the recorded output shows a SexRatio maximum of 265600.0 —
# check whether such extreme values should be treated as outliers.
df.describe()

Unnamed: 0,TotalPopulation,SexRatio(males per 100 females),Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,...,10 to 14.9k,15 to 24.9k,25 to 34.9k,35 to 49.9k,50 to 74.9k,75 to 99.9k,100 to 149.9k,150 to 199.9k,200k or more,Zipcode
count,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,...,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0,32620.0
mean,10054.024372,116.842275,610.375169,623.294758,647.640313,656.84954,681.800307,1392.667903,1268.561435,1302.908768,...,163.377192,336.522563,335.295156,458.712385,640.985653,472.824831,561.61217,250.888627,284.347486,49657.11591
std,14775.50297,1483.160382,1000.754249,999.871869,1030.595489,1055.807952,1243.279669,2337.849699,2003.216409,1933.310096,...,278.29302,523.279157,505.750589,676.724843,930.783407,699.877753,892.501061,471.952425,689.16029,27513.428859
min,3.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,601.0
25%,751.0,91.3,34.0,37.0,40.0,41.0,33.0,71.0,75.0,92.0,...,10.0,27.0,27.0,39.0,53.0,34.0,31.0,7.0,4.0,26763.75
50%,2924.0,98.4,152.0,166.0,181.0,179.0,156.0,311.0,327.5,381.0,...,47.0,102.0,104.0,144.0,204.0,143.0,142.0,42.0,31.0,49764.5
75%,13773.0,107.9,764.0,799.0,833.0,839.0,801.0,1659.0,1630.0,1771.0,...,192.0,425.25,427.0,594.0,850.0,625.0,698.0,261.0,208.0,72047.25
max,128294.0,265600.0,17564.0,14784.0,12289.0,13357.0,23027.0,28192.0,23166.0,18058.0,...,3942.0,5410.0,4349.0,5988.0,8466.0,6491.0,8705.0,6095.0,11968.0,99929.0


In [44]:
# Persist the processed dataset.
# NOTE(review): to_csv writes the row index by default, which is read back as an
# 'Unnamed: 0' column; consider index=False after confirming that no downstream
# reader depends on that extra column.
df.to_csv("../data/Regions_cleaned.csv")