# Actual Joining of Data

We will read in Libby and Getnet's data, join it together, export as a CSV

In [9]:
import pandas as pd
pd.set_option('display.max_rows', None)
import re
import json
import csv

In [10]:
# Read in calls_money
calls_money = pd.read_table("../data/calls_money", sep= ",", index_col=False)
calls_money["ward"] = pd.to_numeric(calls_money["ward"])

In [11]:
full_wards = []

with open("../data/wiki/libbys_scraped_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
with open("../data/wiki/getnet_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
all_alderpeople = pd.DataFrame(full_wards)

#all_alderpeople

Okay. There are a couple of things we have to do to streamline our data. 
1) ensure there is only the year in the start and end dates
2) ensure there is only the number in the ward
3) filter data so we are only working with people between 2012 and 2023
    - so if end date is none or present
    - if end date is between 2012 and 2023

In [12]:
all_alderpeople["Clean Ward"] = all_alderpeople["Ward"].str.extract('(\\d+)')
all_alderpeople["Start Year"] = all_alderpeople["Start Date"].str.extract('(\\d{4})')
all_alderpeople["End Year"] =  all_alderpeople["End Date"].str.extract('(\\d{4})')
all_alderpeople["Start Year"] =  pd.to_numeric(all_alderpeople['Start Year'])
all_alderpeople["End Year"] =  pd.to_numeric(all_alderpeople['End Year'])


## Check data

Ideally, this would be done before I export the data, but I'm going in reverse for the sake of showing a joined dataset. 

In [13]:
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year"]
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year for Fill"].fillna(2023)
aldermen_year_check = all_alderpeople[0:1]

# for year in between Start Year and End Year for Fill
# add a column called year in office
# add row with same value but for every year served

for index, row in all_alderpeople.iterrows():
    start = int(row["Start Year"])
    end = int(row["End Year for Fill"])
    for year in range(start, end + 1): 
        new_row = pd.DataFrame([{'Ward': row["Ward"], \
            'Alderperson': row["Alderperson"], \
            'Start Date': row["Start Date"],
            'End Date': row["End Date"],
            'Party': row["Party"],
            'Notes': row["Notes"],
            'Clean Ward': row["Clean Ward"],
            'Start Year': row["Start Year"],
            'End Year': row["End Year"],
            'End Year for Fill': int(year),
            }])
        
        aldermen_year_check = pd.concat([aldermen_year_check, new_row], ignore_index=True)
        
aldermen_year_check = aldermen_year_check[1::]


In [15]:
alders_2018_2023 =  aldermen_year_check.loc[(aldermen_year_check['End Year for Fill'] >= 2018) ]

#alders_2018_2023 = alders_2018_2023.drop_duplicates()

alders_2018_2023.reset_index()

Unnamed: 0,index,Ward,Alderperson,Start Date,End Date,Party,Notes,Clean Ward,Start Year,End Year,End Year for Fill
0,45,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2018.0
1,46,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2019.0
2,47,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2020.0
3,48,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2021.0
4,49,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2022.0
5,50,5,Leslie Hairston,May 1999,"May 15, 2023",,,5,1999,2023.0,2023.0
6,89,6,Roderick Sawyer,"May 16, 2011","May 15, 2023",,,6,2011,2023.0,2018.0
7,90,6,Roderick Sawyer,"May 16, 2011","May 15, 2023",,,6,2011,2023.0,2019.0
8,91,6,Roderick Sawyer,"May 16, 2011","May 15, 2023",,,6,2011,2023.0,2020.0
9,92,6,Roderick Sawyer,"May 16, 2011","May 15, 2023",,,6,2011,2023.0,2021.0


In [None]:
year_count = alders_2018_2023.groupby(["Clean Ward","End Year for Fill"]).size().rename("count_by_year").reset_index()

year_count_over = year_count.loc[(year_count['count_by_year'] > 1)]

year_count_over.sort_values(["Clean Ward", "End Year for Fill"])

In [None]:
filtered_for_problems = pd.merge(alders_2018_2023, year_count_over, on=["Clean Ward", "End Year for Fill"])

filtered_for_problems.sort_values(["Clean Ward", "End Year for Fill"])

**Libby update, Feb. 23 @ 11:30 p.m.**

Okay, we have good news and bad news. <br>
<br>
Good news is that we are mostly okay, especially after I remembered that 311 data only goes back until 2018, so we don't need to find aldermen between 2012 and 2017 ... I think.  <br>
<br>
There is one error, which is that in the 24th ward, Michael Scott's sister succeeds him, but she's not listed as a bullet point in the big list. BUT she is listed as his successor and there is a link to her information on her page.  <br>
<br>
So thus, I'll need to add a step to seline_wiki_scrape in which I check out the successor heading in the table. If the name under that does not equal the next row in the dictionary (or the name you're checking is the last item in the dictionary), create a dictionary called leftovers. Add the ward number as the key, a list of tuples as the value. The tuple will contain the name of the person and the link associated with them. 

In [None]:
type(aldermen_year_check['Clean Ward'])

calls_money['year']=calls_money['year'].astype(int)
calls_money['ward']=calls_money['ward'].astype(int)

aldermen_year_check['Clean Ward']=aldermen_year_check['Clean Ward'].astype(int)
aldermen_year_check['End Year for Fill']=aldermen_year_check['End Year for Fill'].astype(int)

In [None]:
calls_menu_man = pd.merge(aldermen_year_check, calls_money, left_on=["Clean Ward", "End Year for Fill"], right_on=["ward", "year"])

calls_menu_man = calls_menu_man.loc[(calls_menu_man['End Year for Fill'] >= 2018)]

In [None]:
calls_menu_man_clean = calls_menu_man.loc[:, ['Clean Ward', 'Alderperson', 'Start Date', 'End Date', 'End Year for Fill', 'year', 'category', 'calls', 'num_projects', 'total_cost']]

calls_menu_man_clean= calls_menu_man_clean.sort_values(["Clean Ward", "year",])

calls_menu_man_clean = calls_menu_man_clean.reset_index(drop=True)

calls_menu_man_clean


In [None]:
calls_menu_man_clean.to_csv("../data/calls_menu_man.csv")