# Actual Joining of Data

We will read in Libby and Getnet's data, join it together, export as a CSV

In [1]:
import pandas as pd
import re
import json
import csv

In [27]:
# Read in calls_money
calls_money = pd.read_table("../data/calls_money", sep= ",", index_col=False)
calls_money["ward"] = pd.to_numeric(calls_money["ward"])

calls_money

Unnamed: 0,year,category,ward,calls,num_projects,total_cost
0,2012,Beautification,2,0,26,186900
1,2012,Beautification,3,0,3,15500
2,2012,Beautification,4,0,1,800
3,2012,Beautification,24,0,1,33500
4,2012,Beautification,25,0,3,125473
...,...,...,...,...,...,...
3610,2025,Streets & Transportation,46,275,0,0
3611,2025,Streets & Transportation,47,542,0,0
3612,2025,Streets & Transportation,48,267,0,0
3613,2025,Streets & Transportation,49,357,0,0


In [3]:
full_wards = []

with open("../data/wiki/libbys_scraped_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
with open("../data/wiki/getnet_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
all_alderpeople = pd.DataFrame(full_wards)

all_alderpeople

Unnamed: 0,Ward,Alderperson,Start Date,End Date,Party,Notes
0,5,Paul Howard Douglas,1939,1942,,
1,5,Leon Despres,1955,1975,,
2,5,Leslie Hairston,May 1999,"May 15, 2023",,
3,6,Eugene Sawyer,"February 28, 1971","December 2, 1987",,
4,6,Freddrenna Lyle,"February 8, 1998","May 15, 2011",,
...,...,...,...,...,...,...
199,49th,David Orr,1979,1990,Democratic,
200,49th,Joe Moore,1991,"May 20, 2019",Democratic,
201,49th,Maria Hadden,"May 20, 2019",present,,
202,50th,Bernard Stone,1973,2011,Democratic,


Okay. There are a couple of things we have to do to streamline our data. 
1) ensure there is only the year in the start and end dates
2) ensure there is only the number in the ward
3) filter data so we are only working with people between 2012 and 2023
    - so if end date is none or present
    - if end date is between 2012 and 2023

In [4]:
all_alderpeople["Clean Ward"] = all_alderpeople["Ward"].str.extract('(\\d+)')
all_alderpeople["Start Year"] = all_alderpeople["Start Date"].str.extract('(\\d{4})')
all_alderpeople["End Year"] =  all_alderpeople["End Date"].str.extract('(\\d{4})')
all_alderpeople["Start Year"] =  pd.to_numeric(all_alderpeople['Start Year'])
all_alderpeople["End Year"] =  pd.to_numeric(all_alderpeople['End Year'])


In [5]:
alders_2012_2023 =  all_alderpeople.loc[(all_alderpeople['End Year'] >= 2012) | (all_alderpeople['End Year'].isna()) ]
alders_2012_2023 = alders_2012_2023.sort_values(by = "Clean Ward")

In [6]:
alders_2012_2023.to_csv('../data/wiki/alders_2012_2023.csv')

## Check data

Ideally, this would be done before I export the data, but I'm going in reverse for the sake of showing a joined dataset. 

In [7]:
alders_2012_2023["End Year for Fill"] =  alders_2012_2023["End Year"]
alders_2012_2023["End Year for Fill"] =  alders_2012_2023["End Year for Fill"].fillna(2023)
aldermen_year_check = alders_2012_2023[0:1]

# for year in between Start Year and End Year for Fill
# add a column called year in office
# add row with same value but for every year served

for index, row in alders_2012_2023.iterrows():
    start = int(row["Start Year"])
    end = int(row["End Year for Fill"])
    for year in range(start, end +1):
        new_row = pd.DataFrame([{'Ward': row["Ward"], \
            'Alderperson': row["Alderperson"], \
            'Start Date': row["Start Date"],
            'End Date': row["End Date"],
            'Party': row["Party"],
            'Notes': row["Notes"],
            'Clean Ward': row["Clean Ward"],
            'Start Year': row["Start Year"],
            'End Year': row["End Year"],
            'End Year for Fill': int(year),
            }])
        aldermen_year_check = pd.concat([aldermen_year_check, new_row], ignore_index=True)
        
aldermen_year_check

Unnamed: 0,Ward,Alderperson,Start Date,End Date,Party,Notes,Clean Ward,Start Year,End Year,End Year for Fill
0,1st,Proco Joe Moreno,"March 26, 2010","May 20, 2019",Democratic,,1,2010,2019.0,2019.0
1,1st,Proco Joe Moreno,"March 26, 2010","May 20, 2019",Democratic,,1,2010,2019.0,2010.0
2,1st,Proco Joe Moreno,"March 26, 2010","May 20, 2019",Democratic,,1,2010,2019.0,2011.0
3,1st,Proco Joe Moreno,"March 26, 2010","May 20, 2019",Democratic,,1,2010,2019.0,2012.0
4,1st,Proco Joe Moreno,"March 26, 2010","May 20, 2019",Democratic,,1,2010,2019.0,2013.0
...,...,...,...,...,...,...,...,...,...,...
1063,9,Anthony Beale,May 1999,present,,,9,1999,,2019.0
1064,9,Anthony Beale,May 1999,present,,,9,1999,,2020.0
1065,9,Anthony Beale,May 1999,present,,,9,1999,,2021.0
1066,9,Anthony Beale,May 1999,present,,,9,1999,,2022.0


Filter rows that have end year between 2012 and 2023 
Group by ward?
Count

In [8]:
just_12_and_above = aldermen_year_check.loc[(aldermen_year_check['End Year for Fill'] >= 2012)]

just_12_and_above.groupby("Clean Ward").size()

Clean Ward
1     14
10    13
11    14
12    14
13    12
14    12
15    14
16    10
17    13
18     9
19    12
2     13
20    13
21    12
22    13
23    13
24     8
25    13
26    12
27    12
28    12
29    13
3     12
30    12
31    14
32    12
33    14
34    13
35    13
36    18
37    12
38    13
39    13
4     13
40    13
41    13
42    12
43    13
44    13
45    13
46    13
47    13
48    13
49    13
5     12
50    12
6     12
7     14
8     12
9     12
dtype: int64

In [9]:
#just_18_and_above = aldermen_year_check.loc[(aldermen_year_check['End Year for Fill'] >= 2018)]

In [10]:
#just_18_and_above["clean ward"] = pd.to_numeric(just_18_and_above['Clean Ward'])
#all_alderpeople["Start Year"] = pd.to_numeric(all_alderpeople['Start Year'])

In [11]:
#x = just_18_and_above.groupby("Clean Ward").size().reset_index(name='count')

#x = pd.DataFrame(x)
#x.loc[(x['count'] < 6)]

**Libby update, Feb. 23 @ 11:30 p.m.**

Okay, we have good news and bad news. <br>
<br>
Good news is that we are mostly okay, especially after I remembered that 311 data only goes back until 2018, so we don't need to find aldermen between 2012 and 2017 ... I think.  <br>
<br>
There is one error, which is that in the 24th ward, Michael Scott's sister succeeds him, but she's not listed as a bullet point in the big list. BUT she is listed as his successor and there is a link to her information on her page.  <br>
<br>
So thus, I'll need to add a step to seline_wiki_scrape in which I check out the successor heading in the table. If the name under that does not equal the next row in the dictionary (or the name you're checking is the last item in the dictionary), create a dictionary called leftovers. Add the ward number as the key, a list of tuples as the value. The tuple will contain the name of the person and the link associated with them. 

In [13]:
type(aldermen_year_check['Clean Ward'])

calls_money['year']=calls_money['year'].astype(int)
calls_money['ward']=calls_money['ward'].astype(int)

aldermen_year_check['Clean Ward']=aldermen_year_check['Clean Ward'].astype(int)
aldermen_year_check['End Year for Fill']=aldermen_year_check['End Year for Fill'].astype(int)

In [20]:
calls_menu_man = pd.merge(aldermen_year_check, calls_money, left_on=["Clean Ward", "End Year for Fill"], right_on=["ward", "year"])

calls_menu_man = calls_menu_man.loc[(calls_menu_man['End Year for Fill'] >= 2018)]

In [None]:
calls_menu_man_clean = calls_menu_man.loc[:, ['Clean Ward', 'Alderperson', 'Start Date', 'End Date', 'End Year for Fill', 'year', 'category', 'calls', 'num_projects', 'total_cost']]

calls_menu_man_clean= calls_menu_man_clean.sort_values("year")

calls_money.reset_index(drop=True)

calls_money


Unnamed: 0,year,category,ward,calls,num_projects,total_cost
0,2012,Beautification,2,0,26,186900
1,2012,Beautification,3,0,3,15500
2,2012,Beautification,4,0,1,800
3,2012,Beautification,24,0,1,33500
4,2012,Beautification,25,0,3,125473
...,...,...,...,...,...,...
3610,2025,Streets & Transportation,46,275,0,0
3611,2025,Streets & Transportation,47,542,0,0
3612,2025,Streets & Transportation,48,267,0,0
3613,2025,Streets & Transportation,49,357,0,0


In [23]:
calls_menu_man_clean.to_csv("../data/calls_menu_man.csv")