# Imports 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Get the data

In [2]:
# Sheets (one per nation) to pull out of each workbook.
CHOSEN_SHEETS = ["England", "Scotland", "Wales", "Northern Ireland"]

# The workbook names below are relative, so switch to the data folder first.
os.chdir(r"C:\Users\Áron\Google Drive\UNI\Y4S2\Dissertation\Data")

# Each call returns a dict mapping sheet name -> DataFrame.
# The skiprows/nrows window leaves out the NHS placement questions.
data_18_19 = pd.read_excel(
    r"summary_data_2018_2019.xls",
    sheet_name=CHOSEN_SHEETS,
    skiprows=5,
    nrows=37,
)
data_21_22 = pd.read_excel(
    r"summary_data_2021_2022.xls",
    sheet_name=CHOSEN_SHEETS,
    skiprows=5,
    nrows=37,
)

data_18_19

{'England':                                            Unnamed: 0  % agree  Responses  \
 0                           The teaching on my course      NaN        NaN   
 1            1 - Staff are good at explaining things.     89.0   251764.0   
 2        2 - Staff have made the subject interesting.     82.0   251586.0   
 3       3 - The course is intellectually stimulating.     85.0   251659.0   
 4   4 - My course has challenged me to achieve my ...     81.0   251723.0   
 5                              Learning opportunities      NaN        NaN   
 6   5 - My course has provided me with opportuniti...     84.0   251730.0   
 7   6 - My course has provided me with opportuniti...     85.0   251561.0   
 8   7 - My course has provided me with opportuniti...     81.0   251519.0   
 9                             Assessment and feedback      NaN        NaN   
 10  8 - The criteria used in marking have been cle...     72.0   251630.0   
 11          9 - Marking and assessment has been fair

In [3]:
# Inspect a single sheet: data_18_19 is a dict keyed by sheet name, so this
# shows the raw England table (question text followed by four
# "% agree"/"Responses" column pairs).
data_18_19.get("England")

Unnamed: 0.1,Unnamed: 0,% agree,Responses,% agree.1,Responses.1,% agree.2,Responses.2,% agree.3,Responses.3
0,The teaching on my course,,,,,,,,
1,1 - Staff are good at explaining things.,89.0,251764.0,86.0,19630.0,89.0,259777.0,86.0,21234.0
2,2 - Staff have made the subject interesting.,82.0,251586.0,82.0,19661.0,82.0,259593.0,80.0,21227.0
3,3 - The course is intellectually stimulating.,85.0,251659.0,89.0,20002.0,84.0,259704.0,88.0,21553.0
4,4 - My course has challenged me to achieve my ...,81.0,251723.0,84.0,19993.0,81.0,259749.0,84.0,21536.0
5,Learning opportunities,,,,,,,,
6,5 - My course has provided me with opportuniti...,84.0,251730.0,85.0,19980.0,84.0,259722.0,85.0,21538.0
7,6 - My course has provided me with opportuniti...,85.0,251561.0,84.0,19931.0,85.0,259638.0,83.0,21501.0
8,7 - My course has provided me with opportuniti...,81.0,251519.0,79.0,19792.0,81.0,259525.0,80.0,21361.0
9,Assessment and feedback,,,,,,,,


# Cleaning

In [4]:
def weighted_avg(file, idx, left_side):
    """Response-weighted mean of two "% agree" values from one row.

    The sheet layout is: column 0 holds the question text, followed by four
    (percentage, response count) column pairs at positions 1-2, 3-4, 5-6
    and 7-8.

    Parameters
    ----------
    file : pandas.DataFrame
        Sheet with the column layout described above.
    idx : int
        Positional row index of the question.
    left_side : bool
        True combines the pairs at columns 1-4, False the pairs at
        columns 5-8.

    Returns
    -------
    float
        (pct_a * n_a + pct_b * n_b) / (n_a + n_b)
    """
    first = 1 if left_side else 5
    # Address cells by (row, column) position directly. Going through
    # file.iloc[idx][k] builds a row Series and then looks up an integer key
    # on a label-indexed Series, which pandas has deprecated.
    pct_a = file.iloc[idx, first]
    n_a = file.iloc[idx, first + 1]
    pct_b = file.iloc[idx, first + 2]
    n_b = file.iloc[idx, first + 3]
    return (pct_a * n_a + pct_b * n_b) / (n_a + n_b)
    
def clean(file, sheet_name, left_side):
    """Collapse one sheet into a single row of per-question scores.

    The sheet layout is: column 0 holds the question text, followed by four
    (percentage, response count) column pairs at positions 1-2, 3-4, 5-6
    and 7-8. For every question the two pairs on the chosen side are merged
    into one response-weighted percentage (same formula as weighted_avg).

    Parameters
    ----------
    file : dict of str -> pandas.DataFrame
        Workbook as returned by pd.read_excel with a list of sheet names.
        The stored sheets are not modified.
    sheet_name : str
        Key of the sheet to process; also used as the row label of the result.
    left_side : bool
        True uses the pairs at columns 1-4, False the pairs at columns 5-8.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``sheet_name`` with float columns "Q1".."Qn"
        (numbered in sheet order after text-only rows are removed) and
        "avg_res", the mean over questions of the combined response count.

    Raises
    ------
    KeyError
        If ``sheet_name`` is not in ``file``.
    ValueError
        If no row of the sheet is fully populated.
    """
    sheet = file[sheet_name]

    # Section-heading rows contain only text, so their numeric cells are NaN.
    # Dropping every row with any NaN keeps only question rows. Both sides are
    # filtered together so question numbering is the same for either side.
    questions = sheet.dropna(axis="rows").reset_index(drop=True)
    if questions.empty:
        raise ValueError(f"sheet {sheet_name!r} has no fully populated question rows")

    first = 1 if left_side else 5
    pct_a = questions.iloc[:, first]
    n_a = questions.iloc[:, first + 1]
    pct_b = questions.iloc[:, first + 2]
    n_b = questions.iloc[:, first + 3]

    responses = n_a + n_b
    scores = (pct_a * n_a + pct_b * n_b) / responses

    # Build the one-row frame directly so the question columns stay numeric
    # (transposing a text+number frame would leave them as object dtype).
    result = pd.DataFrame(
        [scores.to_numpy(dtype=float)],
        index=[sheet_name],
        columns=[f"Q{i + 1}" for i in range(len(scores))],
    )
    result["avg_res"] = float(responses.sum()) / len(responses)

    return result

In [5]:
# clean() returns a one-row frame per sheet; stack the rows for all chosen
# sheets into one frame per workbook side. Collecting the rows in a list and
# concatenating once avoids re-copying a growing frame on every iteration.
# left_side=True reads columns 1-4 of a sheet, left_side=False columns 5-8.
df_18 = pd.concat([clean(data_18_19, sheet, True) for sheet in CHOSEN_SHEETS])
df_19 = pd.concat([clean(data_18_19, sheet, False) for sheet in CHOSEN_SHEETS])

df_21 = pd.concat([clean(data_21_22, sheet, True) for sheet in CHOSEN_SHEETS])
df_22 = pd.concat([clean(data_21_22, sheet, False) for sheet in CHOSEN_SHEETS])

In [6]:
# Left-hand columns of the 2018-19 workbook: one row per nation,
# one column per question plus avg_res.
df_18

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,avg_res
England,88.783009,82.0,85.294514,81.220742,84.073534,84.926587,80.854101,72.515288,72.735531,74.588751,...,86.640317,86.784395,69.058393,83.723455,83.489709,75.49598,60.939315,56.41581,83.147429,270038.925926
Scotland,88.915341,82.943519,85.028306,80.0,83.028351,83.971685,78.028175,71.198124,73.170001,64.113283,...,86.887737,87.806887,67.77597,85.832705,85.886224,73.94359,54.943773,51.825386,83.0,24364.407407
Wales,89.928585,83.035739,84.964162,82.0,83.928452,84.928407,81.96437,75.107176,75.248433,75.142281,...,86.754616,86.649433,71.788982,84.930974,85.785941,78.928424,63.750517,58.633181,84.928082,16035.259259
Northern Ireland,89.048171,82.096186,85.192734,83.240948,83.192759,85.192857,84.191424,73.337629,75.385665,69.24141,...,87.668809,87.71811,69.904445,87.809792,83.855445,75.0,58.952546,58.55105,84.096836,7804.962963


In [7]:
# Right-hand columns of the 2018-19 workbook: one row per nation,
# one column per question plus avg_res.
df_19

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,avg_res
England,88.773311,81.848821,84.306524,81.229689,84.076577,84.847044,80.923951,72.53647,72.689258,74.612928,...,86.625307,86.849992,68.018092,83.595529,84.315797,75.399566,60.891646,55.389358,83.153535,279559.666667
Scotland,90.0,83.082465,86.054954,81.10956,84.027478,84.109649,79.109531,72.16487,73.164699,66.219935,...,87.973236,87.973452,66.733208,85.839229,86.945127,74.972476,55.862898,52.903535,84.055054,24337.148148
Wales,89.943368,83.971594,85.0,82.02851,84.886215,85.91481,82.971538,75.028453,75.113862,74.943097,...,86.860723,87.799825,71.915336,85.944758,86.85754,79.0,65.83128,58.757871,84.913909,16123.888889
Northern Ireland,89.0,82.314367,85.314734,83.314774,84.208794,85.209823,84.208901,74.419645,74.523473,68.366478,...,88.58447,88.534422,67.896134,88.739013,83.790287,74.105019,59.79021,56.445635,84.947436,7743.962963


In [8]:
# Left-hand columns of the 2021-22 workbook: one row per nation,
# one column per question plus avg_res.
df_21

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q29,Q30,Q31,Q32,Q33,Q34,Q35,Q36,Q37,avg_res
England,79.6214,84.114773,77.539045,81.194951,75.723991,78.819551,79.360668,81.087338,76.072518,68.802645,...,73.966175,66.179688,78.94068,68.439542,51.170119,53.017989,53.017989,74.888116,74.888116,280585.0
Scotland,82.614836,86.698185,81.298109,84.501031,78.014815,80.450742,81.675728,83.240454,76.514438,66.043233,...,79.302857,67.651472,84.769109,69.114387,48.968171,53.489959,53.489959,79.569913,79.569913,25273.27027
Wales,80.619479,85.018889,79.362901,81.533884,76.600646,79.320056,79.252699,81.297968,77.49897,69.451691,...,76.464819,68.464556,81.292976,69.957475,54.188178,55.662884,55.662884,76.039412,76.039412,16004.081081
Northern Ireland,81.001466,85.351193,77.287827,82.433958,78.988663,80.595194,79.506276,81.457755,80.972493,67.619425,...,79.857942,67.730919,79.214349,69.751559,54.343294,57.047315,57.047315,79.489939,79.489939,8072.351351


In [9]:
# Right-hand columns of the 2021-22 workbook: one row per nation,
# one column per question plus avg_res.
df_22

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q29,Q30,Q31,Q32,Q33,Q34,Q35,Q36,Q37,avg_res
England,79.85275,84.180621,77.785883,81.415474,76.083115,79.323778,79.570588,81.361659,77.108443,68.771016,...,75.751212,66.463558,78.834688,68.657262,51.896535,52.524039,52.524039,75.967532,75.967532,274199.810811
Scotland,81.604588,86.227028,79.894175,83.183348,77.17911,79.496092,80.782648,81.976546,75.775117,65.438093,...,75.727788,66.333838,83.3867,67.727207,47.704054,50.530992,50.530992,78.579104,78.579104,26208.324324
Wales,80.432099,85.004882,78.930552,81.545257,76.313212,78.944488,78.777967,81.323299,76.80596,69.593944,...,74.023325,68.028858,80.495216,69.950839,53.640138,55.275436,55.275436,76.805504,76.805504,14302.432432
Northern Ireland,81.18855,85.20391,78.109806,82.606289,78.915585,81.480948,80.588159,83.1,80.895262,68.224342,...,81.316202,67.296968,79.091098,68.879838,54.05285,55.682932,55.682932,79.347977,79.347977,7078.351351
