# Imports 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Get the data

In [2]:
CHOSEN_SHEETS = ["England", "Scotland", "Wales", "Northern Ireland"]

# Both workbooks are read with the same window: skip the first 5 rows and
# read 37 rows, which leaves out the NHS placement questions.
READ_OPTIONS = dict(skiprows=5, nrows=37, sheet_name=CHOSEN_SHEETS)

# sheet_name is a list, so each call returns a dict of {sheet name: DataFrame}.
summary_1819 = pd.read_excel("data/summary_data_2018_2019.xls", **READ_OPTIONS)
summary_2122 = pd.read_excel("data/summary_data_2021_2022.xls", **READ_OPTIONS)

summary_1819

{'England':                                            Unnamed: 0  % agree  Responses  \
 0                           The teaching on my course      NaN        NaN   
 1            1 - Staff are good at explaining things.     89.0   251764.0   
 2        2 - Staff have made the subject interesting.     82.0   251586.0   
 3       3 - The course is intellectually stimulating.     85.0   251659.0   
 4   4 - My course has challenged me to achieve my ...     81.0   251723.0   
 5                              Learning opportunities      NaN        NaN   
 6   5 - My course has provided me with opportuniti...     84.0   251730.0   
 7   6 - My course has provided me with opportuniti...     85.0   251561.0   
 8   7 - My course has provided me with opportuniti...     81.0   251519.0   
 9                             Assessment and feedback      NaN        NaN   
 10  8 - The criteria used in marking have been cle...     72.0   251630.0   
 11          9 - Marking and assessment has been fair

In [3]:
# Preview one sheet of the 2018-19 workbook (dict.get returns None, not an error, for an unknown key).
summary_1819.get("England")

Unnamed: 0.1,Unnamed: 0,% agree,Responses,% agree.1,Responses.1,% agree.2,Responses.2,% agree.3,Responses.3
0,The teaching on my course,,,,,,,,
1,1 - Staff are good at explaining things.,89.0,251764.0,86.0,19630.0,89.0,259777.0,86.0,21234.0
2,2 - Staff have made the subject interesting.,82.0,251586.0,82.0,19661.0,82.0,259593.0,80.0,21227.0
3,3 - The course is intellectually stimulating.,85.0,251659.0,89.0,20002.0,84.0,259704.0,88.0,21553.0
4,4 - My course has challenged me to achieve my ...,81.0,251723.0,84.0,19993.0,81.0,259749.0,84.0,21536.0
5,Learning opportunities,,,,,,,,
6,5 - My course has provided me with opportuniti...,84.0,251730.0,85.0,19980.0,84.0,259722.0,85.0,21538.0
7,6 - My course has provided me with opportuniti...,85.0,251561.0,84.0,19931.0,85.0,259638.0,83.0,21501.0
8,7 - My course has provided me with opportuniti...,81.0,251519.0,79.0,19792.0,81.0,259525.0,80.0,21361.0
9,Assessment and feedback,,,,,,,,


# Cleaning

In [4]:
def weighted_avg(file, idx, left_side):
    """Return the response-weighted mean of two "% agree" values on one row.

    Columns are addressed purely by position. With ``left_side`` true the two
    (score, responses) pairs come from columns 1-2 and 3-4; otherwise from
    columns 5-6 and 7-8.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame with at least nine columns laid out as described above.
    idx : int
        Positional index of the row to evaluate.
    left_side : bool
        True selects columns 1-4, False selects columns 5-8.

    Returns
    -------
    float
        ``(score_a * n_a + score_b * n_b) / (n_a + n_b)``.
    """
    first = 1 if left_side else 5
    # Two-axis .iloc is positional on both axes. Indexing the row Series with
    # a bare integer (row[1]) relies on a deprecated positional fallback and
    # turns into a *label* lookup whenever the column labels are integers.
    score_a, count_a, score_b, count_b = (
        file.iloc[idx, first + offset] for offset in range(4)
    )
    return (score_a * count_a + score_b * count_b) / (count_a + count_b)
    
def clean(file, sheet_name, left_side):
    """Collapse one sheet into a single row of response-weighted question scores.

    Rows containing any missing value (e.g. text-only section headings) are
    dropped. For every remaining row the two (score, responses) column pairs
    on the chosen side are combined with the same formula as ``weighted_avg``:
    ``(score_a * n_a + score_b * n_b) / (n_a + n_b)``. Columns are addressed by
    position: 1-4 when ``left_side`` is true, 5-8 otherwise.

    The input sheet is not modified, so the function can be re-run safely.

    Parameters
    ----------
    file : dict
        Mapping of sheet name to DataFrame, as returned by ``pd.read_excel``
        when ``sheet_name`` is a list.
    sheet_name : str
        Key of the sheet to process; also used as the index label of the result.
    left_side : bool
        True uses columns 1-4, False uses columns 5-8.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``sheet_name`` with float columns ``Q1..Qn`` (one per
        retained row, in order) and ``avg_res``, the total number of responses
        on the chosen side divided by ``n``.

    Raises
    ------
    KeyError
        If ``sheet_name`` is not present in ``file``.
    ValueError
        If no row survives the missing-value filter.
    """
    sheet = file.get(sheet_name)
    if sheet is None:
        raise KeyError(f"sheet {sheet_name!r} not found")

    # dropna/reset_index return new frames, leaving the caller's sheet intact.
    rows = sheet.dropna(axis="rows").reset_index(drop=True)
    if rows.empty:
        raise ValueError(f"sheet {sheet_name!r} has no complete rows")

    first = 1 if left_side else 5
    # Whole-column positional access replaces the per-row Python loop.
    score_a, count_a, score_b, count_b = (
        rows.iloc[:, first + offset].astype(float) for offset in range(4)
    )
    responses = count_a + count_b
    scores = (score_a * count_a + score_b * count_b) / responses

    # Build the one-row result directly instead of transposing a mixed
    # text/number frame, which would leave the scores with object dtype.
    result = pd.DataFrame(
        [scores.to_numpy()],
        index=[sheet_name],
        columns=[f"Q{number}" for number in range(1, len(scores) + 1)],
    )
    result["avg_res"] = responses.sum() / len(scores)  # mean responses per question
    return result

In [5]:
def stack_regions(workbook, left_side):
    """Clean every sheet in CHOSEN_SHEETS and stack the one-row results."""
    # A single concat over a list avoids repeatedly concatenating onto an
    # empty DataFrame (deprecated in recent pandas, and quadratic).
    return pd.concat([clean(workbook, sheet, left_side) for sheet in CHOSEN_SHEETS])


def uk_total_row(summary):
    """Return per-column averages weighted by the last column, plus that column's sum.

    The last column of ``summary`` (``avg_res``) supplies the weights; every
    other column is averaged across rows with those weights.
    """
    weights = summary.iloc[:, -1].to_numpy(dtype=float)
    scores = summary.iloc[:, :-1].to_numpy(dtype=float)
    return np.append(np.average(scores, axis=0, weights=weights), weights.sum())


# Each workbook holds two column groups; presumably the left group is the
# earlier year and the right group the later one (TODO confirm against the sheets).
df_18 = stack_regions(summary_1819, True)
df_21 = stack_regions(summary_2122, True)
df_19 = stack_regions(summary_1819, False)
df_22 = stack_regions(summary_2122, False)

# Take the plain average of the two years.
# NOTE: this rebinds summary_1819 / summary_2122 from the raw sheet dicts to the
# processed frames (the save cell relies on these names), so this cell can only
# be re-run after re-running the load cell.
# NOTE(review): the saved outputs show Q1-Q27 for 2018-19 but Q1-Q37 for 2021-22
# although both workbooks are read with the same nrows; verify that question
# numbers line up before comparing the two periods.
summary_1819 = (df_18 + df_19) / 2
summary_2122 = (df_21 + df_22) / 2

# Add a total row: response-weighted average per question and summed responses.
values_1819 = uk_total_row(summary_1819)
values_2122 = uk_total_row(summary_2122)

summary_1819.loc["UK"] = values_1819
summary_2122.loc["UK"] = values_2122

# Save and load data

In [6]:
# Persist both processed summary frames next to the source workbooks.
for frame, target in (
    (summary_1819, "data/summary_1819.pkl"),
    (summary_2122, "data/summary_2122.pkl"),
):
    frame.to_pickle(target)

In [7]:
# Round-trip check: reload the 2018-19 summary written by the save cell and display it.
pd.read_pickle("data/summary_1819.pkl")

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,avg_res
England,88.77816,81.924411,84.800519,81.225215,84.075056,84.886815,80.889026,72.525879,72.712395,74.60084,...,86.632812,86.817193,68.538242,83.659492,83.902753,75.447773,60.91548,55.902584,83.150482,274799.296296
Scotland,89.457671,83.012992,85.54163,80.55478,83.527915,84.040667,78.568853,71.681497,73.16735,65.166609,...,87.430486,87.890169,67.254589,85.835967,86.415676,74.458033,55.403336,52.364461,83.527527,24350.777778
Wales,89.935977,83.503667,84.982081,82.014255,84.407333,85.421609,82.467954,75.067815,75.181148,75.042689,...,86.807669,87.224629,71.852159,85.437866,86.321741,78.964212,64.790899,58.695526,84.920995,16079.574074
Northern Ireland,89.024086,82.205276,85.253734,83.277861,83.700776,85.20134,84.200163,73.878637,74.954569,68.803944,...,88.12664,88.126266,68.900289,88.274402,83.822866,74.552509,59.371378,57.498343,84.522136,7774.462963
UK,88.892944,82.091855,84.876337,81.263357,84.04134,84.857219,80.87241,72.621323,72.923558,73.772077,...,86.737608,86.949874,68.615155,84.02318,84.210696,75.526663,60.655686,55.813295,83.30006,323004.111111


In [8]:
# Round-trip check: reload the 2021-22 summary written by the save cell and display it.
pd.read_pickle("data/summary_2122.pkl")

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q29,Q30,Q31,Q32,Q33,Q34,Q35,Q36,Q37,avg_res
England,79.737075,84.147697,77.662464,81.305213,75.903553,79.071665,79.465628,81.224499,76.59048,68.78683,...,74.858694,66.321623,78.887684,68.548402,51.533327,52.771014,52.771014,75.427824,75.427824,277392.405405
Scotland,82.109712,86.462607,80.596142,83.84219,77.596963,79.973417,81.229188,82.6085,76.144778,65.740663,...,77.515323,66.992655,84.077905,68.420797,48.336113,52.010475,52.010475,79.074508,79.074508,25740.797297
Wales,80.525789,85.011885,79.146727,81.53957,76.456929,79.132272,79.015333,81.310633,77.152465,69.522817,...,75.244072,68.246707,80.894096,69.954157,53.914158,55.46916,55.46916,76.422458,76.422458,15153.256757
Northern Ireland,81.095008,85.277551,77.698817,82.520124,78.952124,81.038071,80.047217,82.278878,80.933878,67.921884,...,80.587072,67.513943,79.152724,69.315699,54.198072,56.365123,56.365123,79.418958,79.418958,7575.351351
UK,79.992741,84.397011,77.96407,81.544757,76.133924,79.191428,79.597518,81.362342,76.682378,68.560322,...,75.219638,66.491868,79.397139,68.62153,51.45343,52.919959,52.919959,75.854922,75.854922,325861.810811
