In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1]:
### Explore National Student Survey Scale data and reformat

In [None]:
# Office for Students page for the NSS data (presumably where the workbooks read below were downloaded).
data_url = (
    'https://www.officeforstudents.org.uk'
    '/advice-and-guidance/student-information-and-data'
    '/national-student-survey-nss/get-the-nss-data/'
)

In [2]:
# Read the 'NSSFULLTIME3' sheet, skipping the first three rows of the sheet
# (presumably title/preamble rows above the real header).
aorig = pd.read_excel(
    'NSS Answers Stats.xlsx',
    sheet_name='NSSFULLTIME3',
    skiprows=3,
)

# Quick sanity check: dimensions first, then a preview of the first rows.
print(aorig.shape)
aorig.head()

(243080, 18)


Unnamed: 0,UKPRN,Provider,Subject Code,Subject,Level,Question Number,Answered 1,Answered 2,Answered 3,Answered 4,Answered 5,N/A,Confidence interval - min,Actual value,Confidence interval - max,Response,Sample Size,Two years aggregate data?
0,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Q01,0.0,0.0202,0.0101,0.5253,0.4444,0,0.9061,0.9697,0.9907,99,173,
1,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Q02,0.0,0.0,0.0909,0.5152,0.3939,0,0.8265,0.9091,0.9545,99,173,
2,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Q03,0.0,0.0202,0.0202,0.1818,0.7778,0,0.8919,0.9596,0.9856,99,173,
3,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Q04,0.0,0.0303,0.0505,0.3636,0.5556,0,0.839,0.9192,0.9613,99,173,
4,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale01,0.0,0.0177,0.0429,0.3965,0.5429,0,0.8648,0.9394,0.9741,99,173,


In [3]:
# Column dtypes and non-null counts: every column listed in the report is fully populated.
# NOTE(review): 'Two years aggregate data?' is blank in the head() preview, so the
# frame is probably not entirely free of missing values — confirm before relying on it.
aorig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243080 entries, 0 to 243079
Data columns (total 18 columns):
UKPRN                        243080 non-null int64
Provider                     243080 non-null object
Subject Code                 243080 non-null object
Subject                      243080 non-null object
Level                        243080 non-null object
Question Number              243080 non-null object
Answered 1                   243080 non-null float64
Answered 2                   243080 non-null float64
Answered 3                   243080 non-null float64
Answered 4                   243080 non-null float64
Answered 5                   243080 non-null float64
N/A                          243080 non-null int64
Confidence interval - min    243080 non-null float64
Actual value                 243080 non-null float64
Confidence interval - max    243080 non-null float64
Response                     243080 non-null int64
Sample Size                  243080 non-null int64
T

## Explore Other Undergraduate

In [4]:
# Split the survey rows by study level so the two groups can be compared.
aother = aorig.loc[aorig['Level'].eq('Other undergraduate')]
areg = aorig.loc[aorig['Level'].eq('First degree')]

# One shape per line: 'Other undergraduate' first, then 'First degree'.
print(aother.shape, areg.shape, sep='\n')

(38914, 18)
(204166, 18)


In [5]:
# Number of distinct providers in each level subset
# (dropna=False keeps the count identical to len(unique())).
print(aother['Provider'].nunique(dropna=False))
print(areg['Provider'].nunique(dropna=False))

300
298


In [6]:
# Number of distinct subjects in each level subset
# (dropna=False keeps the count identical to len(unique())).
print(aother['Subject'].nunique(dropna=False))
print(areg['Subject'].nunique(dropna=False))

93
150


In [7]:
# 'Other undergraduate' is likely the more specialised provision (certificate / diploma),
# so these rows should be kept. Show its ten most frequent subjects.
aother['Subject'].value_counts().head(10)

Education                                2940
Sport and exercise sciences              2555
Computer science                         2205
Business studies                         1995
Others in subjects allied to medicine    1890
Tourism, transport and travel            1785
Design studies                           1400
Cinematics and photography               1365
Management studies                       1294
Health studies                           1274
Name: Subject, dtype: int64

In [8]:
# Ten most frequent subjects among 'First degree' rows, for comparison.
areg['Subject'].value_counts().head(10)

Business studies             4375
Psychology (non-specific)    4235
Sociology                    4130
Law                          4095
Drama                        4060
Design studies               3990
Computer science             3850
Accounting                   3780
Music                        3605
Management studies           3535
Name: Subject, dtype: int64

## Build Dataset

In [45]:
# Distinct subjects in the full, unfiltered data.
aorig['Subject'].nunique(dropna=False)

151

In [46]:
# Keep only the Scale01–Scale08 rows plus questions Q26 and Q27.
q_keep = [f'Scale{scale_no:02d}' for scale_no in range(1, 9)] + ['Q26', 'Q27']

aedit = aorig.loc[aorig['Question Number'].isin(q_keep)]

print(aedit.shape)
aedit.head(2)

(68614, 18)


Unnamed: 0,UKPRN,Provider,Subject Code,Subject,Level,Question Number,Answered 1,Answered 2,Answered 3,Answered 4,Answered 5,N/A,Confidence interval - min,Actual value,Confidence interval - max,Response,Sample Size,Two years aggregate data?
4,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale01,0.0,0.0177,0.0429,0.3965,0.5429,0,0.8648,0.9394,0.9741,99,173,
8,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale02,0.0034,0.0303,0.0505,0.2997,0.6162,0,0.8348,0.9158,0.959,99,173,


In [47]:
# Candidate filter: subjects with fewer than 100 rows.
# Each provider contributes 10 question rows per subject (per Level),
# so the 20 least frequent subjects are shown to judge the cut-off.
aedit['Subject'].value_counts().tail(20)

Biosciences (non-specific)                40
Physical sciences (non-specific)          40
Hair and beauty sciences                  40
Welsh studies                             40
Forestry and arboriculture                40
Others in engineering                     40
Materials science                         40
Operational research                      40
Others in English studies                 40
Others in computing                       40
Naval architecture                        40
Polymers and textiles                     30
Minerals technology                       30
Celtic studies (non-specific)             20
Heritage studies                          20
Community nursing                         20
Others in Celtic studies                  20
Business computing                        10
Personal development                      10
Business and management (non-specific)    10
Name: Subject, dtype: int64

In [48]:
# Collect the names of subjects that have fewer than 100 rows in aedit.
subject_row_counts = aedit['Subject'].value_counts()
small_subject = subject_row_counts[subject_row_counts < 100].index.tolist()

# How many subjects fall under the threshold, then a sample of their names.
print(len(small_subject))
small_subject[:10]

37


['Veterinary medicine and dentistry',
 'Humanities (non-specific)',
 'African and modern Middle Eastern studies',
 'Creative arts and design (non-specific)',
 'Publishing',
 'Combined, general or negotiated studies',
 'Landscape design',
 'Development studies',
 'Maritime technology',
 'Food sciences']

In [49]:
# Remove the subjects that have fewer than 100 rows.
# With 10 question rows per provider and level, that is roughly the subjects
# offered by fewer than 10 provider/level combinations.
# `~` negates the boolean mask (idiomatic replacement for `== False`).
aedit = aedit[~aedit['Subject'].isin(small_subject)]

# Number of subjects left after the filter.
len(aedit.Subject.unique())

114

In [50]:
# Keep only rows backed by at least 10 responses.
aedit = aedit.loc[aedit['Response'].ge(10)]

aedit.shape

(66584, 18)

In [51]:
# Narrow the frame to the identifying columns plus the score and its response count.
keep_cols = [
    'UKPRN', 'Provider', 'Subject Code', 'Subject',
    'Level', 'Question Number', 'Actual value', 'Response',
]
aedit = aedit.loc[:, keep_cols]

aedit.head(2)

Unnamed: 0,UKPRN,Provider,Subject Code,Subject,Level,Question Number,Actual value,Response
4,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale01,0.9394,99
8,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale02,0.9158,99


In [52]:
# Summary statistics of the numeric columns (UKPRN, Actual value, Response).
# NOTE(review): the recorded UKPRN maximum of 100000000 is far above the other
# quantiles and may be a placeholder identifier — verify against the source data.
aedit.describe()

Unnamed: 0,UKPRN,Actual value,Response
count,66584.0,66584.0,66584.0
mean,10074780.0,0.776012,52.889072
std,2465184.0,0.140677,61.915694
min,10000060.0,0.0,10.0
25%,10004800.0,0.7022,16.0
50%,10007160.0,0.8011,30.0
75%,10007800.0,0.8765,65.0
max,100000000.0,1.0,1514.0


In [55]:
# remaining features
print(f"Schools remaining: {len(aedit.Provider.unique())}")
print(f"Subjects remaining: {len(aedit.Subject.unique())}")
print(aedit.Level.value_counts())

Schools remaining: 445
Subjects remaining: 114
First degree           55931
Other undergraduate    10653
Name: Level, dtype: int64


## Add country

In [57]:
# Read the 'Scale 1' sheet of the answers workbook, skipping its first four rows;
# it is loaded here for its per-provider 'Country' column.
cn_df = pd.read_excel(
    'NSS Answers.xlsx',
    sheet_name='Scale 1',
    skiprows=4,
)

cn_df.head(2)

Unnamed: 0,UKPRN,Provider,Agree (%),Benchmark (%),+/-,Agree (%).1,Benchmark (%).1,+/-.1,Agree (%).2,Benchmark (%).2,+/-.2,Agree (%).3,Benchmark (%).3,+/-.3,Country
0,10007783,University of Aberdeen,85.4,85.0,,85.4,85.0,,86.39,84.91,,86.39,84.91,,Scotland
1,10007849,University of Abertay Dundee,81.64,84.14,,81.64,84.14,,87.44,83.89,,87.44,83.89,,Scotland


In [59]:
# Reduce the lookup table to the join key and the country label.
cn_df = cn_df.loc[:, ['UKPRN', 'Country']]

cn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 2 columns):
UKPRN      451 non-null int64
Country    451 non-null object
dtypes: int64(1), object(1)
memory usage: 7.1+ KB


In [60]:
# Attach each provider's country through its UKPRN.
# Any 'Country' column left by a previous run of this cell is dropped first, so
# re-running the cell cannot produce 'Country_x' / 'Country_y' duplicates.
# The default inner join discards rows whose UKPRN is absent from cn_df
# (66584 -> 66177 rows in the recorded run).
aedit = pd.merge(aedit.drop(columns='Country', errors='ignore'), cn_df, on='UKPRN')

# info() prints its own report and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
aedit.info()
aedit.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66177 entries, 0 to 66176
Data columns (total 9 columns):
UKPRN              66177 non-null int64
Provider           66177 non-null object
Subject Code       66177 non-null object
Subject            66177 non-null object
Level              66177 non-null object
Question Number    66177 non-null object
Actual value       66177 non-null float64
Response           66177 non-null int64
Country            66177 non-null object
dtypes: float64(1), int64(2), object(6)
memory usage: 5.0+ MB
None


Unnamed: 0,UKPRN,Provider,Subject Code,Subject,Level,Question Number,Actual value,Response,Country
0,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale01,0.9394,99,Scotland
1,10007783,University of Aberdeen,CAH01-01-02,Medicine (non-specific),First degree,Scale02,0.9158,99,Scotland


In [61]:
# aedit.to_csv('he_choose_detail.csv')

In [62]:
# Confirm which study levels survive the filtering and the merge.
aedit['Level'].unique()

array(['First degree', 'Other undergraduate'], dtype=object)

In [66]:
# Total 'Response' per country and study level.
# The string 'sum' replaces np.sum, which recent pandas versions deprecate as an aggfunc.
# NOTE(review): 'Response' is repeated on every question row of a provider/subject/level,
# so these totals count the same respondents once per retained question — confirm
# whether a de-duplicated total is wanted instead.
piv_test = pd.pivot_table(
    aedit,
    values='Response',
    index=['Country', 'Level'],
    aggfunc='sum',
)

piv_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Response
Country,Level,Unnamed: 2_level_1
England,First degree,2749300
England,Other undergraduate,216982
Northern Ireland,First degree,72761
Northern Ireland,Other undergraduate,8325
Scotland,First degree,276436
Scotland,Other undergraduate,2355
Wales,First degree,165697
Wales,Other undergraduate,12636


In [68]:
# Rows per retained question after all filters and the merge; in the recorded run
# Scale07, Scale08, Q27 and Q26 have fewer rows than Scale01–Scale06.
aedit['Question Number'].value_counts()

Scale04    6642
Scale02    6642
Scale06    6642
Scale05    6642
Scale03    6642
Scale01    6642
Scale07    6641
Scale08    6637
Q27        6629
Q26        6418
Name: Question Number, dtype: int64