# Actual Joining of Data

We will read in Libby and Getnet's data, join it together, export as a CSV

In [47]:
import pandas as pd
import re
import json
import csv

In [48]:
full_wards = []

with open("../data/wiki/libbys_scraped_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
with open("../data/wiki/getnet_data.json", "r") as f:
    df = json.load(f)
    for row in df:
        full_wards.append(row)
        
all_alderpeople = pd.DataFrame(full_wards)

all_alderpeople

Unnamed: 0,Ward,Alderperson,Start Date,End Date,Party,Notes
0,5,Paul Howard Douglas,1939,1942,,
1,5,Leon Despres,1955,1975,,
2,5,Leslie Hairston,May 1999,"May 15, 2023",,
3,6,Eugene Sawyer,"February 28, 1971","December 2, 1987",,
4,6,Freddrenna Lyle,"February 8, 1998","May 15, 2011",,
...,...,...,...,...,...,...
201,49th,David Orr,1979,1990,Democratic,
202,49th,Joe Moore,1991,"May 20, 2019",Democratic,
203,49th,Maria Hadden,"May 20, 2019",present,,
204,50th,Bernard Stone,1973,2011,Democratic,


Okay. There are a couple of things we have to do to streamline our data. 
1) ensure there is only the year in the start and end dates
2) ensure there is only the number in the ward
3) filter data so we are only working with people between 2012 and 2023
    - so if end date is none or present
    - if end date is between 2012 and 2023

In [49]:
all_alderpeople["Clean Ward"] = all_alderpeople["Ward"].str.extract('(\\d+)')
all_alderpeople["Start Year"] = all_alderpeople["Start Date"].str.extract('(\\d{4})')
all_alderpeople["End Year"] =  all_alderpeople["End Date"].str.extract('(\\d{4})')
all_alderpeople["Start Year"] =  pd.to_numeric(all_alderpeople['Start Year'])
all_alderpeople["End Year"] =  pd.to_numeric(all_alderpeople['End Year'])


In [51]:
alders_2012_2023 =  all_alderpeople.loc[(all_alderpeople['End Year'] >= 2012) | (all_alderpeople['End Year'].isna()) ]
alders_2012_2023 = alders_2012_2023.sort_values(by = "Clean Ward")

In [52]:
alders_2012_2023.to_csv('../data/wiki/alders_2012_2023.csv')

## Check data

Ideally, this would be done before I export the data, but I'm going in reverse for the sake of showing a joined dataset. 

In [56]:
all_alderpeople.loc[, 0]

SyntaxError: invalid syntax (2293335541.py, line 1)

In [None]:
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year"]
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year for Fill"].fillna(2023)
aldermen_year_check = pd.DataFrame(alders_2012_2023.iloc[0])

aldermen_year_check

Unnamed: 0,104
Ward,1st
Alderperson,Proco Joe Moreno
Start Date,"March 26, 2010"
End Date,"May 20, 2019"
Party,Democratic
Notes,
Clean Ward,1
Start Year,2010
End Year,2019.0


In [53]:
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year"]
all_alderpeople["End Year for Fill"] =  all_alderpeople["End Year for Fill"].fillna(2023)
#all_alderpeople["Start Year for Fill"] =  all_alderpeople["Start Year"]
aldermen_year_check = pd.DataFrame(alders_2012_2023.iloc[0])

# for year in between Start Year and End Year for Fill
# add a column called year in office
# add row with same value but for every year served

for index, row in alders_2012_2023.iterrows():
    start = int(row["Start Year"])
    end = int(row["End Year for Fill"])
    for year in range(start, end +1):
        new_row = pd.DataFrame([{'Ward': row["Start Year"], \
            'Alderperson': row["Alderperson"], \
            'Start Date': row["Start Date"],
            'End Date': row["End Date"],
            'Party': row["Party"],
            'Notes': row["Notes"],
            'Clean Ward': row["Clean Ward"],
            'Start Year': row["Start Year"],
            'End Year': row["End Year"],
            'End Year for Fill': int(year),
            }])
        aldermen_year_check = pd.concat([aldermen_year_check, new_row], ignore_index=True)
        
aldermen_year_check

KeyError: 'End Year for Fill'

In [None]:
aldermen_year_check.to_csv('../data/wiki/copy_alders_2012_2023.csv')