# Imports 

In [3]:
import numpy as np
import pandas as pd

# Get the data

In [8]:
%%time

CHOSEN_SHEETS_SUMMARY = ["England", "Scotland", "Wales", "Northern Ireland"]
CHOSEN_SHEETS_UNIS = ["Q01", "Q02", "Q03", "Q04", "Q05", "Q06", "Q07", "Q08", "Q09", "Q10", "Q11", "Q12", "Q13", "Q14", "Q15",
                     "Q16", "Q17", "Q18", "Q19", "Q20", "Q21", "Q22", "Q23", "Q24", "Q25", "Q26", "Q27"]
CHOSEN_SHEETS_COURSE = ["NSS1", "NSS2", "NSS3"]

summary_data_1819 = pd.read_excel(r"data/summary_data_2018_2019.xls", skiprows = 5, nrows = 37, sheet_name = CHOSEN_SHEETS_SUMMARY) #skip nhs placement questions
summary_data_2122 = pd.read_excel(r"data/summary_data_2021_2022.xls", skiprows = 5, nrows = 37, sheet_name = CHOSEN_SHEETS_SUMMARY) #skip nhs placement questions

unis_data_1819 = pd.read_excel(r"data/unis_2018_2019.xls", skiprows = 5, sheet_name = CHOSEN_SHEETS_UNIS, header = None)
unis_data_2122 = pd.read_excel(r"data/unis_2021_2022.xls", skiprows = 5, sheet_name = CHOSEN_SHEETS_UNIS, header = None)

course_data_18 = pd.read_excel(r"data/course_data_2018.xls", skiprows = 4, sheet_name = CHOSEN_SHEETS_COURSE, header = None)
course_data_19 = pd.read_excel(r"data/course_data_2019.xls", skiprows = 4, sheet_name = CHOSEN_SHEETS_COURSE, header = None)
course_data_21 = pd.read_excel(r"data/course_data_2021.xls", skiprows = 4, sheet_name = CHOSEN_SHEETS_COURSE, header = None)
course_data_22 = pd.read_excel(r"data/course_data_2022.xls", skiprows = 4, sheet_name = CHOSEN_SHEETS_COURSE, header = None)

Wall time: 48.1 s


# Cleaning

#### 1. Cleaning summary data

In [16]:
def weighted_avg(file, idx, left_side):
    """Return the count-weighted mean of two value columns for one row.

    The row is read purely by column position: columns 1-4 form the first
    (value, count, value, count) group and columns 5-8 the second.

    Parameters
    ----------
    file : pandas.DataFrame
        Cleaned sheet with at least nine columns.
    idx : int
        Positional row index.
    left_side : bool
        True to use columns 1-4, False to use columns 5-8.

    Returns
    -------
    float
        (value_a * count_a + value_b * count_b) / (count_a + count_b)
    """
    first = 1 if left_side else 5
    # Two-axis .iloc is always positional. The previous `file.iloc[idx][k]`
    # looked an integer key up on a string-labelled row Series, which relies
    # on a positional fallback deprecated in pandas 2.1.
    value_a, count_a, value_b, count_b = (file.iloc[idx, first + k] for k in range(4))
    return (value_a * count_a + value_b * count_b) / (count_a + count_b)
    
def clean(file, sheet_name, left_side):
    """Turn one summary sheet into a single-row frame of per-question scores.

    Rows containing any missing value are discarded (the original note says
    these are text-only rows). For each remaining row the score is the
    count-weighted mean of two value columns, using the same formula as
    ``weighted_avg``: columns 1-4 when ``left_side`` is true, columns 5-8
    otherwise. The result has one row labelled ``sheet_name``, one column per
    question (``Q1``, ``Q2``, ...) and a final ``avg_res`` column holding the
    mean number of responses per question.

    Parameters
    ----------
    file : dict[str, pandas.DataFrame]
        Workbook as returned by ``pd.read_excel`` with a list of sheet names.
    sheet_name : str
        Sheet to process; also used as the label of the returned row.
    left_side : bool
        True to use columns 1-4, False to use columns 5-8.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``sheet_name`` is not present in ``file``.
    ValueError
        If the sheet has no complete rows.
    """
    sheet = file.get(sheet_name)
    if sheet is None:
        raise KeyError(f"sheet {sheet_name!r} not found in workbook")

    # dropna/reset_index return new frames, so the caller's sheet is never
    # mutated and the function can be re-run safely.
    df = sheet.dropna(axis="rows").reset_index(drop=True)
    if df.empty:
        raise ValueError(f"sheet {sheet_name!r} has no complete rows")

    # Column-wise arithmetic replaces the per-row loop and avoids the
    # deprecated integer lookup on a string-labelled row (`df.iloc[i][k]`).
    first = 1 if left_side else 5
    value_a, count_a, value_b, count_b = (df.iloc[:, first + k] for k in range(4))
    responses = count_a + count_b
    score = (value_a * count_a + value_b * count_b) / responses
    total_responses = responses.sum()

    label_col = df.columns[0]  # question text; "Unnamed: 0" for header-less Excel cells
    df = df.drop(columns=df.columns[[1, 2, 3, 4, 5, 6, 7, 8]])  # raw columns no longer needed
    df["score"] = score

    # Transpose so questions become features, then drop the question-text row.
    df = (
        df.T
        .rename(columns=lambda pos: "Q" + str(pos + 1))
        .drop(label_col, axis="rows")
        .rename(index={"score": sheet_name})
    )
    df["avg_res"] = total_responses / df.shape[1]  # mean responses per question

    return df

In [17]:
def _stack_regions(workbook, left_side):
    """Clean every summary sheet for one side of a workbook and stack the rows."""
    # A single concat over a list replaces growing a frame from pd.DataFrame():
    # that pattern re-copies the data on every step, and concatenating with an
    # empty frame is deprecated in recent pandas.
    return pd.concat([clean(workbook, sheet, left_side) for sheet in CHOSEN_SHEETS_SUMMARY])


def _uk_totals(summary):
    """Return the values for a total row: weighted scores plus summed responses.

    Every column except the last is averaged across rows, weighted by the last
    column (avg_res); the final entry is the sum of that last column.
    """
    weights = summary.iloc[:, -1]
    total_weight = sum(weights)
    totals = [
        sum(summary.iloc[:, col] * weights) / total_weight
        for col in range(summary.shape[1] - 1)
    ]
    totals.append(total_weight)
    return totals


# Each workbook covers two years: the left-hand column group is the first year
# and the right-hand group the second (per the variable names).
df_18 = _stack_regions(summary_data_1819, True)
df_21 = _stack_regions(summary_data_2122, True)
df_19 = _stack_regions(summary_data_1819, False)
df_22 = _stack_regions(summary_data_2122, False)

# take the plain average of the two years
summary_1819 = (df_18 + df_19) / 2
summary_2122 = (df_21 + df_22) / 2

# add a total row: response-weighted average of scores and sum of responses
values_1819 = _uk_totals(summary_1819)
values_2122 = _uk_totals(summary_2122)

summary_1819.loc["UK"] = np.array(values_1819)
summary_2122.loc["UK"] = np.array(values_2122)

#### 2. Cleaning unis data

In [18]:
def clean_unis(file, sheet_name):
    """Reduce one per-question university sheet to a name column and a score.

    Only columns 1, 2 and 8 of the header-less sheet are used: column 1 is
    renamed to ``University`` and the mean of columns 2 and 8 (the results of
    the two years) is stored under ``sheet_name``. Rows with a missing value
    in any of those three columns are discarded and the index is renumbered.

    Parameters
    ----------
    file : dict[str, pandas.DataFrame]
        Workbook as returned by ``pd.read_excel`` with a list of sheet names.
    sheet_name : str
        Sheet to process; also the name of the resulting score column.

    Returns
    -------
    pandas.DataFrame
        Columns ``University`` and ``sheet_name``.
    """
    unused = [0, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14]
    kept = (
        file.get(sheet_name)
        .drop(columns=unused)
        .dropna(axis="rows")
        .reset_index(drop=True)
    )
    # Positions 1 and 2 of the trimmed frame are original columns 2 and 8.
    kept[sheet_name] = (kept.iloc[:, 1] + kept.iloc[:, 2]) / 2
    return kept.rename(columns={1: "University"}).drop(columns=[2, 8])

In [19]:
def _with_occurrence(frame):
    """Add a per-name occurrence counter so repeated provider names stay distinct."""
    return frame.assign(_occurrence=frame.groupby("University").cumcount())


def _build_unis_table(workbook):
    """Combine all question sheets of one workbook into a single table.

    The first sheet supplies the rows (and the University column); every other
    sheet contributes only its score column, matched on provider name.

    The previous version attached scores by row position. Because clean_unis
    drops incomplete rows separately for each sheet, a provider missing a score
    for one question shifted every later row and silently gave scores to the
    wrong provider. Matching on name leaves a missing score as NaN instead.
    When every sheet lists the same providers in the same order, the result is
    identical to the positional version.
    """
    first_sheet, *other_sheets = CHOSEN_SHEETS_UNIS
    table = _with_occurrence(clean_unis(workbook, first_sheet))
    for sheet in other_sheets:
        scores = _with_occurrence(clean_unis(workbook, sheet)[["University", sheet]])
        # A left join keeps exactly the rows, and the order, of the first sheet.
        table = table.merge(scores, on=["University", "_occurrence"], how="left")
    return table.drop(columns="_occurrence")


unis_1819 = _build_unis_table(unis_data_1819)
unis_2122 = _build_unis_table(unis_data_2122)

#### 3. Cleaning course data

In [23]:
# NOTE(review): this reads the same workbook, with the same options, as the
# course_data_18 assignment in the data-loading cell.
course_data_18 = pd.read_excel(
    r"data/course_data_2018.xls",
    sheet_name=CHOSEN_SHEETS_COURSE,
    skiprows=4,
    header=None,
)


In [62]:
# Take a private deep copy of the NSS1 sheet so the loaded workbook stays
# untouched, then display it.
df = course_data_18.get("NSS1").copy(deep=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,10007783,University of Aberdeen,CAH01,Medicine and dentistry,First degree,Q01,0.0000,0.0270,0.0090,0.4955,0.4685,0,0.9030,0.9640,0.9872,111,161,
1,10007783,University of Aberdeen,CAH01,Medicine and dentistry,First degree,Q02,0.0182,0.0273,0.0545,0.3636,0.5364,1,0.8206,0.9000,0.9465,110,161,
2,10007783,University of Aberdeen,CAH01,Medicine and dentistry,First degree,Q03,0.0000,0.0000,0.0450,0.1532,0.8018,0,0.8907,0.9550,0.9822,111,161,
3,10007783,University of Aberdeen,CAH01,Medicine and dentistry,First degree,Q04,0.0000,0.0360,0.0811,0.3063,0.5766,0,0.8006,0.8829,0.9340,111,161,
4,10007783,University of Aberdeen,CAH01,Medicine and dentistry,First degree,Q05,0.0000,0.0270,0.0360,0.3694,0.5676,0,0.8670,0.9369,0.9713,111,161,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65527,10006022,Solent University,CAH04,Psychology,First degree,Q02,0.0000,0.0413,0.1857,0.6460,0.1271,0,0.6219,0.7731,0.8758,59,79,
65528,10006022,Solent University,CAH04,Psychology,First degree,Q03,0.0206,0.0825,0.0722,0.5086,0.3161,0,0.6791,0.8246,0.9126,59,79,
65529,10006022,Solent University,CAH04,Psychology,First degree,Q04,0.0206,0.1238,0.1168,0.4640,0.2748,0,0.5852,0.7388,0.8501,59,79,
65530,10006022,Solent University,CAH04,Psychology,First degree,Q05,0.0000,0.1444,0.0996,0.4605,0.2954,0,0.6035,0.7559,0.8631,59,79,


In [63]:
# Discard every listed column, which leaves columns 3, 5 and 13.
# Rebinding with errors="ignore" makes the cell safe to re-run: the previous
# in-place drop raised KeyError on a second execution because the columns
# were already gone.
df = df.drop(columns=[0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17], errors="ignore")

In [64]:
# Distinct values of column 3, in order of first appearance.
pd.unique(df[3])

array(['Medicine and dentistry', 'Subjects allied to medicine',
       'Biological and sport sciences', 'Psychology', 'Physical sciences',
       'Mathematical sciences', 'Engineering and technology', 'Computing',
       'Geographical and environmental studies', 'Social sciences', 'Law',
       'Business and management', 'Language and area studies',
       'Historical, philosophical and religious studies',
       'Creative arts and design', 'Education and teaching',
       'Agriculture, food and related studies',
       'Communications and media', 'Veterinary sciences',
       'Architecture, building and planning',
       'Combined and general studies',
       'Humanities and liberal arts (non-specific)'], dtype=object)

In [58]:
# Number of distinct values in column 3 (a missing value counts as one).
# NOTE(review): the recorded output (142) disagrees with the 22 values listed
# by the previous cell; the execution counts suggest it came from different
# data. Re-run from a fresh kernel.
df[3].nunique(dropna=False)

142

In [59]:
# Mean of the numeric columns for every (column 5, column 3) pair.
_by_question_and_subject = df.groupby(by=[5, 3])
test = _by_question_and_subject.mean(numeric_only=True)

In [60]:
# Display the grouped means built from df.groupby([5, 3]).
test

Unnamed: 0_level_0,Unnamed: 1_level_0,13
5,3,Unnamed: 2_level_1
NHS1,Adult nursing,0.776710
NHS1,"Anatomy, physiology and pathology",0.700000
NHS1,Children's nursing,0.800100
NHS1,Complementary and alternative medicine,0.888100
NHS1,"Counselling, psychotherapy and occupational therapy",0.814764
...,...,...
Q27,Theology and religious studies,0.931430
Q27,"Tourism, transport and travel",0.820103
Q27,Veterinary medicine and dentistry,0.880050
Q27,Welsh studies,0.956400


In [61]:
# Inspect the group at position 12; in the recorded output this is
# (NHS1, Nutrition and dietetics).
# NOTE(review): a positional lookup depends on the group sort order; a label
# lookup with test.loc[...] would be more robust.
test.iloc[12]

13    0.8326
Name: (NHS1, Nutrition and dietetics), dtype: float64

# Save and load data

In [21]:
# Persist the cleaned summary and university tables as pickles.
for _path, _table in (
    ("data/summary_1819.pkl", summary_1819),
    ("data/summary_2122.pkl", summary_2122),
    ("data/unis_1819.pkl", unis_1819),
    ("data/unis_2122.pkl", unis_2122),
):
    _table.to_pickle(_path)

In [22]:
# Round-trip check: reload the 2018/19 summary pickle and display it.
pd.read_pickle("data/summary_1819.pkl")

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,avg_res
England,88.77816,81.924411,84.800519,81.225215,84.075056,84.886815,80.889026,72.525879,72.712395,74.60084,...,86.632812,86.817193,68.538242,83.659492,83.902753,75.447773,60.91548,55.902584,83.150482,274799.296296
Scotland,89.457671,83.012992,85.54163,80.55478,83.527915,84.040667,78.568853,71.681497,73.16735,65.166609,...,87.430486,87.890169,67.254589,85.835967,86.415676,74.458033,55.403336,52.364461,83.527527,24350.777778
Wales,89.935977,83.503667,84.982081,82.014255,84.407333,85.421609,82.467954,75.067815,75.181148,75.042689,...,86.807669,87.224629,71.852159,85.437866,86.321741,78.964212,64.790899,58.695526,84.920995,16079.574074
Northern Ireland,89.024086,82.205276,85.253734,83.277861,83.700776,85.20134,84.200163,73.878637,74.954569,68.803944,...,88.12664,88.126266,68.900289,88.274402,83.822866,74.552509,59.371378,57.498343,84.522136,7774.462963
UK,88.892944,82.091855,84.876337,81.263357,84.04134,84.857219,80.87241,72.621323,72.923558,73.772077,...,86.737608,86.949874,68.615155,84.02318,84.210696,75.526663,60.655686,55.813295,83.30006,323004.111111


In [23]:
# Round-trip check: reload the 2021/22 summary pickle and display it.
pd.read_pickle("data/summary_2122.pkl")

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q29,Q30,Q31,Q32,Q33,Q34,Q35,Q36,Q37,avg_res
England,79.737075,84.147697,77.662464,81.305213,75.903553,79.071665,79.465628,81.224499,76.59048,68.78683,...,74.858694,66.321623,78.887684,68.548402,51.533327,52.771014,52.771014,75.427824,75.427824,277392.405405
Scotland,82.109712,86.462607,80.596142,83.84219,77.596963,79.973417,81.229188,82.6085,76.144778,65.740663,...,77.515323,66.992655,84.077905,68.420797,48.336113,52.010475,52.010475,79.074508,79.074508,25740.797297
Wales,80.525789,85.011885,79.146727,81.53957,76.456929,79.132272,79.015333,81.310633,77.152465,69.522817,...,75.244072,68.246707,80.894096,69.954157,53.914158,55.46916,55.46916,76.422458,76.422458,15153.256757
Northern Ireland,81.095008,85.277551,77.698817,82.520124,78.952124,81.038071,80.047217,82.278878,80.933878,67.921884,...,80.587072,67.513943,79.152724,69.315699,54.198072,56.365123,56.365123,79.418958,79.418958,7575.351351
UK,79.992741,84.397011,77.96407,81.544757,76.133924,79.191428,79.597518,81.362342,76.682378,68.560322,...,75.219638,66.491868,79.397139,68.62153,51.45343,52.919959,52.919959,75.854922,75.854922,325861.810811


In [24]:
# Round-trip check: reload the 2018/19 universities pickle and display it.
pd.read_pickle("data/unis_1819.pkl")

Unnamed: 0,University,Q01,Q02,Q03,Q04,Q05,Q06,Q07,Q08,Q09,...,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27
0,University of Aberdeen,91.345,84.160,88.010,80.060,85.455,83.625,76.190,72.820,77.140,...,84.495,90.445,90.810,72.345,87.335,90.275,78.440,58.760,44.100,86.645
1,Abertay University,90.895,84.025,82.300,80.950,83.200,85.370,78.330,77.285,76.705,...,87.470,88.060,89.715,68.865,86.465,88.630,76.885,59.520,53.505,82.965
2,Aberystwyth University,95.120,89.995,90.100,85.585,89.530,89.910,86.170,84.175,82.035,...,91.680,91.530,91.270,75.230,88.050,89.925,85.540,77.150,58.170,90.470
3,Abingdon and Witney College,94.310,92.630,91.380,87.960,84.600,88.250,82.260,78.610,82.040,...,75.910,75.335,83.585,71.545,89.340,86.545,87.170,72.340,47.770,89.215
4,ACM Guildford Limited,80.725,68.250,51.025,50.355,65.165,65.820,65.425,52.195,60.040,...,61.215,53.395,64.325,59.545,75.470,65.185,61.785,42.940,36.845,50.085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,York College,88.030,82.215,77.800,77.185,73.715,76.060,78.795,71.130,79.185,...,67.935,74.605,70.590,65.855,76.210,72.150,74.320,52.810,36.535,75.000
412,York St John University,91.440,85.700,83.405,80.440,86.385,85.280,82.690,76.280,72.990,...,86.140,88.790,86.960,71.890,85.160,87.645,80.175,62.745,56.920,85.470
413,Medway School of Pharmacy,86.960,82.100,85.215,77.310,81.055,87.180,84.385,62.600,64.405,...,79.510,86.950,88.235,57.845,86.600,83.455,72.000,69.220,49.530,84.800
414,Hull and York Medical School,83.010,86.460,92.515,78.320,80.010,80.805,93.295,62.615,62.110,...,78.320,91.735,91.980,79.100,91.225,84.935,54.185,41.100,49.440,76.725


In [25]:
# Round-trip check: reload the 2021/22 universities pickle and display it.
pd.read_pickle("data/unis_2122.pkl")

Unnamed: 0,University,Q01,Q02,Q03,Q04,Q05,Q06,Q07,Q08,Q09,...,Q18,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27
0,University of Aberdeen,89.710,83.805,87.465,80.030,83.305,84.815,76.645,68.720,73.830,...,80.765,83.885,84.755,65.650,78.815,90.235,73.165,53.870,52.595,85.085
1,Abertay University,89.885,82.375,82.535,77.345,82.740,81.375,78.440,74.410,72.840,...,78.040,77.545,79.580,55.545,77.540,84.605,69.930,52.170,45.015,80.000
2,Aberystwyth University,91.130,85.960,87.490,79.215,84.445,85.285,80.410,79.010,78.460,...,83.050,82.975,83.150,66.420,79.960,89.240,79.970,68.370,61.290,85.165
3,Abingdon and Witney College,84.135,83.975,75.960,72.115,79.710,86.060,83.560,66.500,77.210,...,72.885,71.155,73.890,65.865,81.060,82.980,86.500,65.415,54.420,84.500
4,ACM Guildford Limited,72.950,66.890,56.565,56.515,59.605,65.260,59.180,58.430,57.515,...,50.790,50.445,52.125,43.700,61.310,49.255,55.705,29.470,28.670,44.585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,University of York,87.670,82.130,86.295,74.125,83.050,83.740,73.430,66.965,69.055,...,80.180,83.165,83.420,59.365,75.970,84.455,67.900,44.450,41.100,78.975
408,York College,84.225,88.335,83.150,86.485,88.150,86.295,84.445,66.475,71.650,...,75.715,85.930,76.280,82.960,70.555,84.225,75.930,70.555,71.130,82.595
409,York St John University,86.610,81.475,82.565,78.455,81.960,82.590,76.970,73.375,73.825,...,75.420,80.700,78.585,65.885,78.710,82.640,73.895,54.250,56.750,77.995
410,Hull and York Medical School,77.170,82.070,92.415,77.335,69.335,80.280,93.980,57.075,63.960,...,73.915,85.540,89.470,70.660,88.980,79.655,55.170,42.885,36.350,73.625
