In [1]:
from collections import Counter

from google.cloud import bigquery
import numpy as np
import pandas as pd

In [27]:
# Table IDs that are left out by default when listing table schemas and when
# pulling the FSM columns below.
# NOTE(review): the name suggests these tables have a non-standard layout — confirm.
non_standard_sen = [
    "AP_Census_2008_2020",
    "Spring_Census_2011_Disability",
    "Spring_Census_2012_Disability",
    "PRU_Census_2010_to_2013",
]
def return_table_variables(database_name, filter=False, skip_tables=non_standard_sen):
    """Map every table in a BigQuery dataset to the list of its column names.

    A new ``bigquery.Client`` is created on each call, and one ``get_table``
    request is issued per table that survives the filters.

    Args:
        database_name: ID of the dataset whose tables are listed.
        filter: Optional substring. When truthy, only tables whose ID contains
            it are kept; the comparison ignores case on both sides. The default
            ``False`` disables filtering. (The name shadows the ``filter``
            builtin inside this function; it is kept so keyword callers keep
            working.)
        skip_tables: Collection of table IDs to leave out (exact,
            case-sensitive match). A falsy value disables skipping.

    Returns:
        dict: ``{table_id: [column_name, ...]}``. Keys follow the order in
        which ``list_tables`` yields the tables; column names follow the
        order of the table schema.
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(database_name)

    # Table IDs are lower-cased before the substring test, so the filter has to
    # be lower-cased too; otherwise any filter containing an upper-case letter
    # would silently match nothing.
    needle = filter.lower() if filter else None

    table_variables = {}
    for table in client.list_tables(dataset_ref):
        table_id = table.table_id
        if needle and needle not in table_id.lower():
            continue
        if skip_tables and table_id in skip_tables:
            continue
        schema = client.get_table(table).schema
        table_variables[table_id] = [field.name for field in schema]
    return table_variables
            
            
# Column names of every table in CB_RAW_CENSUS except the default skip list.
table_variables = return_table_variables(database_name="CB_RAW_CENSUS")

In [None]:
# NOTE(review): unfinished scratch arithmetic. The cell was never executed and
# originally ended in a dangling `+`, which is a SyntaxError and stops a full
# re-run. TODO: add the missing third term, or delete this cell.
partial_sum = 1509 + 3539
partial_sum

In [17]:
# Flatten the per-table column lists into one list; repeats are kept on purpose
# so that they can be counted afterwards.
all_vars = []
for table_vars in table_variables.values():
    all_vars.extend(table_vars)

In [18]:
# How often each column name occurs across the listed tables, most frequent first.
# (`Counter` is imported in the notebook's import cell.)
Counter(all_vars).most_common()

[('edrn', 43),
 ('pupil_matching_ref_anonymous', 43),
 ('record_status', 43),
 ('academic_year', 43),
 ('census_date', 43),
 ('census_term', 43),
 ('la', 43),
 ('estab', 43),
 ('la_estab', 43),
 ('urn', 43),
 ('gender', 43),
 ('age_at_start_of_academic_year', 43),
 ('month_part_of_age_at_start_of_academic_year', 43),
 ('year_of_birth', 43),
 ('month_of_birth', 43),
 ('fs_meligible', 43),
 ('enrol_status', 43),
 ('entry_date', 43),
 ('leaving_date', 43),
 ('part_time', 43),
 ('boarder', 43),
 ('n_cyear_actual', 43),
 ('se_nprovision', 43),
 ('language', 40),
 ('type_of_class', 37),
 ('language_group_minor', 37),
 ('language_group_major', 37),
 ('locality', 36),
 ('town', 36),
 ('administrative_area', 36),
 ('on_roll', 34),
 ('se_nprovision_major', 34),
 ('hours_at_setting', 31),
 ('funded_hours', 28),
 ('la_9_code', 28),
 ('idaci_score', 27),
 ('idaci_rank', 27),
 ('ethnicity', 25),
 ('yssa', 22),
 ('oa11', 22),
 ('lsoa11', 22),
 ('connexions', 21),
 ('coa', 19),
 ('llsoa', 19),
 ('ethn

In [46]:
# Pull the FSM-eligibility columns from every CB_RAW_CENSUS table that is not in
# `non_standard_sen` and stack them into a single frame.
# This cell deliberately does not rebind `table_variables`: that name holds the
# schema listing built earlier and is still needed by the column-count cells.
client = bigquery.Client()
dataset_ref = client.dataset("CB_RAW_CENSUS")
tables = client.list_tables(dataset_ref)

# The same four columns are selected from every table, so build the list once.
cols = "edrn, academic_year, census_term, fs_meligible"

# Collect the per-table frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
fsm_frames = []
for table in tables:
    if table.table_id in non_standard_sen:
        continue
    fsm_frames.append(pd.read_gbq(f"SELECT {cols} FROM CB_RAW_CENSUS.{table.table_id}"))

# pd.concat raises on an empty list, so fall back to an empty frame when every
# table was skipped. Each table's own row index is kept (no ignore_index).
fsm_df = pd.concat(fsm_frames) if fsm_frames else pd.DataFrame()

In [47]:
# Show row count, column dtypes and memory footprint of the combined raw-census frame.
fsm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5473880 entries, 0 to 135617
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   edrn           object
 1   academic_year  object
 2   census_term    object
 3   fs_meligible   Int64 
dtypes: Int64(1), object(3)
memory usage: 214.0+ MB


In [52]:
%%bigquery fdm_census
-- Load the pupil id, academic year, census term and FSM flag from the FDM
-- src_census table; the result is stored in the `fdm_census` DataFrame.
SELECT EDRN, AcademicYear, CensusTerm, FSMEligible 
FROM `yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_census` 

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 683.89query/s] 
Downloading: 100%|██████████| 5955732/5955732 [00:14<00:00, 412181.10rows/s] 


In [53]:
# Show row count, column dtypes and memory footprint of the FDM census extract.
fdm_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5955732 entries, 0 to 5955731
Data columns (total 4 columns):
 #   Column        Dtype  
---  ------        -----  
 0   EDRN          object 
 1   AcademicYear  object 
 2   CensusTerm    object 
 3   FSMEligible   boolean
dtypes: boolean(1), object(3)
memory usage: 147.7+ MB


In [54]:
# Align the FDM column names with the raw-census frame so the two can be joined
# on edrn / academic_year / census_term. The FSM flag gets its own name so that
# both versions of it sit side by side after the merge.
# Reassigning (rather than inplace=True) leaves the frame loaded by the query
# cell unmodified; keys that are already renamed are ignored on a re-run.
fdm_census = fdm_census.rename(
    columns={
        "EDRN": "edrn",
        "AcademicYear": "academic_year",
        "CensusTerm": "census_term",
        "FSMEligible": "fs_meligible_census",
    }
)

In [56]:
# Inner-join the raw-census rows to the FDM rows on pupil, year and term.
# NOTE(review): in the recorded run the result has more rows (5,477,686) than
# the left frame (5,473,880), so some key combinations match more than one FDM
# row — confirm whether those duplicates are expected before trusting counts.
join_keys = ["edrn", "academic_year", "census_term"]
merged_df = pd.merge(fsm_df, fdm_census, how="inner", on=join_keys)

In [62]:
# Show row count, column dtypes and memory footprint of the joined frame.
# The recorded output lists fs_meligible as boolean because the cast cell that
# follows was executed before this one (see the execution counts).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5477686 entries, 0 to 5477685
Data columns (total 5 columns):
 #   Column               Dtype  
---  ------               -----  
 0   edrn                 object 
 1   academic_year        object 
 2   census_term          object 
 3   fs_meligible         boolean
 4   fs_meligible_census  boolean
dtypes: boolean(2), object(3)
memory usage: 188.1+ MB


In [61]:
# Cast the raw-census FSM flag to the nullable boolean dtype so it has the same
# dtype as fs_meligible_census when the two are compared.
merged_df["fs_meligible"] = merged_df["fs_meligible"].astype("boolean")

In [68]:
# Rows where the raw-census FSM flag differs from the FDM one.
# NOTE(review): both columns are nullable booleans, so a comparison involving a
# missing value is itself missing — confirm how such rows should be treated.
fsm_mismatch_mask = merged_df["fs_meligible"] != merged_df["fs_meligible_census"]

# Number of mismatching rows per pupil (non-null academic_year entries per edrn) ...
mismatch_rows_per_edrn = merged_df[fsm_mismatch_mask].groupby("edrn")["academic_year"].count()

# ... and how many pupils have each number of mismatching rows.
mismatch_rows_per_edrn.value_counts()

2     286
4      17
1      13
6       2
20      1
12      1
Name: academic_year, dtype: int64