In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [2]:
# Load the raw survey responses from the CSV export.
df = pd.read_csv(filepath_or_buffer="ZGP_survey_data.csv")
# List the original column labels (the full question texts).
print(df.columns)

Index(['Which grade do you belong in?', 'Gender', 'Age', 'Nationality',
       'Which department do you belong in?',
       'I am satisfied with the quality of life as an university student',
       'I am satisfied with my overall performance at university class ',
       'I am getting the most out of my university education ',
       'I feel a sense of accomplishment from my campus activities',
       'I am proud of the effort I am putting on the university class ',
       'I would recommend the university to my friend or family member based on my experience on campus',
       'I am satisfied with my overall performance outside the campus ',
       'I feel that I am accomplishing something outside of school',
       'I can be proud of my extracurricular activity',
       'I am satisfied with my activeness outside the campus',
       'I am satisfied with the variety of out-of-campus opportunities',
       'I am satisfied with the progress I am making with job hunting',
       'I am sat

In [3]:
# Peek at the first five rows of the raw survey responses.
df.head(n=5)

Unnamed: 0,Which grade do you belong in?,Gender,Age,Nationality,Which department do you belong in?,I am satisfied with the quality of life as an university student,I am satisfied with my overall performance at university class,I am getting the most out of my university education,I feel a sense of accomplishment from my campus activities,I am proud of the effort I am putting on the university class,...,I feel good and refreshed when I wake up in the morning,I don’t get sleepy in the middle of the day,What time do you often go to sleep?,How many hours do you sleep on average?,"How many days per week do you engage in moderate to vigorous physical activity, such as brisk walking, running, or playing sports?",How many servings of fruits and vegetables do you consume on an average day?,"How often do you consume foods that are high in saturated fat, added sugar, or salt?","On a scale of 1 to 5, how confident are you that you are following a healthy diet? (1 = not confident at all, 5 = very confident)",I am satisfied with the quality of life as an university student.1,Unnamed: 34
0,2nd,male,20,Japan,Economics PEARL,4,4,4,4,4,...,4,4,0:00~1:00,7~9 hours,7 days,Very often (daily),Sometimes (2-3 times a week),4,4,
1,2nd,female,19,Outside Japan,Economics PEARL,4,3,4,4,3,...,4,5,0:00~1:00,4~7 hours,5-6 days,Often (4-5 times a week),Sometimes (2-3 times a week),4,4,
2,2nd,male,20,Japan,Economics PEARL,4,1,1,3,1,...,4,4,23:00~0:00,7~9 hours,7 days,Rarely (once a week or less),Sometimes (2-3 times a week),3,4,
3,2nd,female,20,Japan,Economics PEARL,4,4,2,2,3,...,4,2,0:00~1:00,7~9 hours,1-2 days,Very often (daily),Often (4-5 times a week),4,2,
4,4th,female,22,Japan,Economics PEARL,4,4,2,3,2,...,1,1,0:00~1:00,4~7 hours,3-4 days,Often (4-5 times a week),Very often (daily),2,2,


In [4]:
# The CSV has a meaningless trailing column ("Unnamed: 34"), so drop it.
# errors="ignore" keeps this cell re-runnable: once `df` no longer contains
# the column, running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 34"], errors="ignore")


In [5]:
# Preview again to confirm the "Unnamed: 34" column is gone.
df.head(n=5)

Unnamed: 0,Which grade do you belong in?,Gender,Age,Nationality,Which department do you belong in?,I am satisfied with the quality of life as an university student,I am satisfied with my overall performance at university class,I am getting the most out of my university education,I feel a sense of accomplishment from my campus activities,I am proud of the effort I am putting on the university class,...,I would recommend the university to a friend or family member based on my social experience,I feel good and refreshed when I wake up in the morning,I don’t get sleepy in the middle of the day,What time do you often go to sleep?,How many hours do you sleep on average?,"How many days per week do you engage in moderate to vigorous physical activity, such as brisk walking, running, or playing sports?",How many servings of fruits and vegetables do you consume on an average day?,"How often do you consume foods that are high in saturated fat, added sugar, or salt?","On a scale of 1 to 5, how confident are you that you are following a healthy diet? (1 = not confident at all, 5 = very confident)",I am satisfied with the quality of life as an university student.1
0,2nd,male,20,Japan,Economics PEARL,4,4,4,4,4,...,3,4,4,0:00~1:00,7~9 hours,7 days,Very often (daily),Sometimes (2-3 times a week),4,4
1,2nd,female,19,Outside Japan,Economics PEARL,4,3,4,4,3,...,4,4,5,0:00~1:00,4~7 hours,5-6 days,Often (4-5 times a week),Sometimes (2-3 times a week),4,4
2,2nd,male,20,Japan,Economics PEARL,4,1,1,3,1,...,3,4,4,23:00~0:00,7~9 hours,7 days,Rarely (once a week or less),Sometimes (2-3 times a week),3,4
3,2nd,female,20,Japan,Economics PEARL,4,4,2,2,3,...,2,4,2,0:00~1:00,7~9 hours,1-2 days,Very often (daily),Often (4-5 times a week),4,2
4,4th,female,22,Japan,Economics PEARL,4,4,2,3,2,...,3,1,1,0:00~1:00,4~7 hours,3-4 days,Often (4-5 times a week),Very often (daily),2,2


In [6]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 34 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype 
---  ------                                                                                                                              --------------  ----- 
 0   Which grade do you belong in?                                                                                                       52 non-null     object
 1   Gender                                                                                                                              52 non-null     object
 2   Age                                                                                                                                 52 non-null     int64 
 3   Nationality                                                                                 

In [7]:
# Swap the full question texts for short column codes.
# The keys have to match the CSV headers character for character: several of
# them end with a trailing space and one contains a typographic apostrophe (’).
column_aliases = {
    # respondent information
    'Which grade do you belong in?': "Grade",
    'Which department do you belong in?': "Department",
    # overall quality of life
    'I am satisfied with the quality of life as an university student': "SQ_QOL",
    # SQ_IC_*: university class / on-campus items
    'I am satisfied with my overall performance at university class ': "SQ_IC_Perform",
    'I am getting the most out of my university education ': "SQ_IC_Effort",
    'I feel a sense of accomplishment from my campus activities': "SQ_IC_Accomp",
    'I am proud of the effort I am putting on the university class ': "SQ_IC_Pride",
    'I would recommend the university to my friend or family member based on my experience on campus': "SQ_IC_Recom",
    # SQ_OC_*: outside-campus items
    'I am satisfied with my overall performance outside the campus ': "SQ_OC_Perform",
    'I feel that I am accomplishing something outside of school': "SQ_OC_Accomp",
    'I can be proud of my extracurricular activity': "SQ_OC_Pride",
    'I am satisfied with my activeness outside the campus': "SQ_OC_Active",
    'I am satisfied with the variety of out-of-campus opportunities': "SQ_OC_VarOp",
    # SQ_JH_*: job hunting / career items
    'I am satisfied with the progress I am making with job hunting': "SQ_JH_Prog",
    'I am satisfied with my current career plan ': "SQ_JH_Plan",
    'I am satisfied with the level of my dedication towards my future career': "SQ_JH_Dedi",
    'I feel confident that what I am experiencing will have a positive impact on my future career': "SQ_JH_Exp",
    'I am confident that the network I am building will have a positive impact on my future career': "SQ_JH_Netw",
    # SQ_SI_*: interaction with other students
    'I am satisfied with the way I interact with other students': "SQ_SI_Way",
    'I feel a sense of belonging with the community in the university': "SQ_SI_Belong",
    'I think the interaction with other students is meaningful for my future': "SQ_SI_Future",
    'I would recommend the university to a friend or family member based on my social experience ': "SQ_SI_Recom",
    # SQ_SL_*: sleep items
    'I feel good and refreshed when I wake up in the morning': "SQ_SL_Morn",
    'I don’t get sleepy in the middle of the day': "SQ_SL_Mid",
    'What time do you often go to sleep?': "SQ_SL_When",
    'How many hours do you sleep on average?': "SQ_SL_Hour",
    # physical activity
    'How many days per week do you engage in moderate to vigorous physical activity, such as brisk walking, running, or playing sports?': "SQ_EXER",
    # SQ_D_*: diet items
    'How many servings of fruits and vegetables do you consume on an average day?': "SQ_D_Healthy",
    'How often do you consume foods that are high in saturated fat, added sugar, or salt?': "SQ_D_Unhealthy",
    'On a scale of 1 to 5, how confident are you that you are following a healthy diet? (1 = not confident at all, 5 = very confident)': "SQ_D_Conf",
    # the quality-of-life header appears a second time in the CSV (hence the ".1" suffix)
    'I am satisfied with the quality of life as an university student.1': "SQ_QOL_Re",
}

df = df.rename(columns=column_aliases)

In [8]:
# Preview again to check that the short column names were applied.
df.head(n=5)

Unnamed: 0,Grade,Gender,Age,Nationality,Department,SQ_QOL,SQ_IC_Perform,SQ_IC_Effort,SQ_IC_Accomp,SQ_IC_Pride,...,SQ_SI_Recom,SQ_SL_Morn,SQ_SL_Mid,SQ_SL_When,SQ_SL_Hour,SQ_EXER,SQ_D_Healthy,SQ_D_Unhealthy,SQ_D_Conf,SQ_QOL_Re
0,2nd,male,20,Japan,Economics PEARL,4,4,4,4,4,...,3,4,4,0:00~1:00,7~9 hours,7 days,Very often (daily),Sometimes (2-3 times a week),4,4
1,2nd,female,19,Outside Japan,Economics PEARL,4,3,4,4,3,...,4,4,5,0:00~1:00,4~7 hours,5-6 days,Often (4-5 times a week),Sometimes (2-3 times a week),4,4
2,2nd,male,20,Japan,Economics PEARL,4,1,1,3,1,...,3,4,4,23:00~0:00,7~9 hours,7 days,Rarely (once a week or less),Sometimes (2-3 times a week),3,4
3,2nd,female,20,Japan,Economics PEARL,4,4,2,2,3,...,2,4,2,0:00~1:00,7~9 hours,1-2 days,Very often (daily),Often (4-5 times a week),4,2
4,4th,female,22,Japan,Economics PEARL,4,4,2,3,2,...,3,1,1,0:00~1:00,4~7 hours,3-4 days,Often (4-5 times a week),Very often (daily),2,2


In [9]:
# Build `df_quant`: a copy of `df` in which the five categorical answers are
# coded as integers (scores start at 1), so that every survey column is numeric
# and can be used for correlation matrices, regression, etc.
# Theoretically, a higher score is better for the student.
#
# Each column is converted with `Series.map` instead of `DataFrame.replace`:
#   * the numeric dtype no longer depends on `replace`'s implicit
#     object -> int downcasting, which pandas has deprecated;
#   * an answer that is missing from a scale is reported right away instead of
#     being silently left in the column as a string.
ORDINAL_SCALES = {
    "SQ_SL_When": {"Before 22:00": 6, "22:00~23:00": 5, "23:00~0:00": 4,
                   "0:00~1:00": 3, "1:00~2:00": 2, "After 2:00": 1},
    # NOTE(review): "more than 9 hours" shares score 2 with "4~7 hours", so this
    # scale is not monotone in sleep duration - presumably oversleeping is
    # treated as sub-optimal; confirm this is intended.
    "SQ_SL_Hour": {"more than 9 hours": 2, "7~9 hours": 3, "4~7 hours": 2,
                   "less than 4 hours": 1},
    "SQ_EXER": {"7 days": 5, "5-6 days": 4, "3-4 days": 3, "1-2 days": 2, "0 days": 1},
    "SQ_D_Healthy": {"Very often (daily)": 5, "Often (4-5 times a week)": 4,
                     "Sometimes (2-3 times a week)": 3,
                     "Rarely (once a week or less)": 2, "Never": 1},
    # Reverse-coded: the less often unhealthy food is eaten, the higher the score.
    "SQ_D_Unhealthy": {"Very often (daily)": 1, "Often (4-5 times a week)": 2,
                       "Sometimes (2-3 times a week)": 3,
                       "Rarely (once a week or less)": 4, "Never": 5},
}

df_quant = df.copy()
for column, scale in ORDINAL_SCALES.items():
    coded = df[column].map(scale)
    # Answers that are present in the data but absent from the scale.
    unmapped = df.loc[coded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped answers in {column!r}: {list(unmapped)}")
    df_quant[column] = coded
df_quant.head(5)


Unnamed: 0,Grade,Gender,Age,Nationality,Department,SQ_QOL,SQ_IC_Perform,SQ_IC_Effort,SQ_IC_Accomp,SQ_IC_Pride,...,SQ_SI_Recom,SQ_SL_Morn,SQ_SL_Mid,SQ_SL_When,SQ_SL_Hour,SQ_EXER,SQ_D_Healthy,SQ_D_Unhealthy,SQ_D_Conf,SQ_QOL_Re
0,2nd,male,20,Japan,Economics PEARL,4,4,4,4,4,...,3,4,4,3,3,5,5,3,4,4
1,2nd,female,19,Outside Japan,Economics PEARL,4,3,4,4,3,...,4,4,5,3,2,4,4,3,4,4
2,2nd,male,20,Japan,Economics PEARL,4,1,1,3,1,...,3,4,4,4,3,5,2,3,3,4
3,2nd,female,20,Japan,Economics PEARL,4,4,2,2,3,...,2,4,2,3,3,2,5,2,4,2
4,4th,female,22,Japan,Economics PEARL,4,4,2,3,2,...,3,1,1,3,2,3,4,1,2,2


In [10]:
# Keep only the survey-question (SQ) columns by removing the student
# information columns (grade, gender, age, nationality, department).
student_info_columns = ["Grade", "Gender", "Age", "Nationality", "Department"]
df_SQ = df.drop(columns=student_info_columns)
df_quant_SQ = df_quant.drop(columns=student_info_columns)



In [11]:
# Preview the SQ-only frame built from `df` (without the student information columns).
df_SQ.head(n=5)

Unnamed: 0,SQ_QOL,SQ_IC_Perform,SQ_IC_Effort,SQ_IC_Accomp,SQ_IC_Pride,SQ_IC_Recom,SQ_OC_Perform,SQ_OC_Accomp,SQ_OC_Pride,SQ_OC_Active,...,SQ_SI_Recom,SQ_SL_Morn,SQ_SL_Mid,SQ_SL_When,SQ_SL_Hour,SQ_EXER,SQ_D_Healthy,SQ_D_Unhealthy,SQ_D_Conf,SQ_QOL_Re
0,4,4,4,4,4,4,3,3,3,3,...,3,4,4,0:00~1:00,7~9 hours,7 days,Very often (daily),Sometimes (2-3 times a week),4,4
1,4,3,4,4,3,3,4,5,5,5,...,4,4,5,0:00~1:00,4~7 hours,5-6 days,Often (4-5 times a week),Sometimes (2-3 times a week),4,4
2,4,1,1,3,1,3,5,5,5,5,...,3,4,4,23:00~0:00,7~9 hours,7 days,Rarely (once a week or less),Sometimes (2-3 times a week),3,4
3,4,4,2,2,3,2,4,4,4,4,...,2,4,2,0:00~1:00,7~9 hours,1-2 days,Very often (daily),Often (4-5 times a week),4,2
4,4,4,2,3,2,1,4,5,4,4,...,3,1,1,0:00~1:00,4~7 hours,3-4 days,Often (4-5 times a week),Very often (daily),2,2


In [12]:
# Inspect the integer coding of the exercise-frequency item.
df_quant_SQ.loc[:, "SQ_EXER"]

0     5
1     4
2     5
3     2
4     3
5     4
6     2
7     2
8     2
9     3
10    2
11    3
12    3
13    3
14    2
15    3
16    2
17    2
18    2
19    3
20    4
21    4
22    2
23    5
24    3
25    3
26    5
27    3
28    3
29    3
30    2
31    5
32    1
33    2
34    2
35    5
36    4
37    5
38    4
39    4
40    4
41    2
42    3
43    2
44    2
45    2
46    5
47    3
48    2
49    2
50    5
51    2
Name: SQ_EXER, dtype: int64

In [13]:
# Preview the SQ-only frame built from `df_quant` (integer-coded answers).
df_quant_SQ.head(n=5)

Unnamed: 0,SQ_QOL,SQ_IC_Perform,SQ_IC_Effort,SQ_IC_Accomp,SQ_IC_Pride,SQ_IC_Recom,SQ_OC_Perform,SQ_OC_Accomp,SQ_OC_Pride,SQ_OC_Active,...,SQ_SI_Recom,SQ_SL_Morn,SQ_SL_Mid,SQ_SL_When,SQ_SL_Hour,SQ_EXER,SQ_D_Healthy,SQ_D_Unhealthy,SQ_D_Conf,SQ_QOL_Re
0,4,4,4,4,4,4,3,3,3,3,...,3,4,4,3,3,5,5,3,4,4
1,4,3,4,4,3,3,4,5,5,5,...,4,4,5,3,2,4,4,3,4,4
2,4,1,1,3,1,3,5,5,5,5,...,3,4,4,4,3,5,2,3,3,4
3,4,4,2,2,3,2,4,4,4,4,...,2,4,2,3,3,2,5,2,4,2
4,4,4,2,3,2,1,4,5,4,4,...,3,1,1,3,2,3,4,1,2,2


In [15]:
# Standardize every survey column with StandardScaler and wrap the resulting
# array back into a DataFrame that keeps the original column labels and index.
# `fit_transform` fits the scaler itself, so a separate `.fit()` beforehand
# would only repeat the same computation; `scaler` is still fitted afterwards.
scaler = StandardScaler()
df_quant_SQ_STD = pd.DataFrame(
    scaler.fit_transform(df_quant_SQ),
    columns=df_quant_SQ.columns,
    index=df_quant_SQ.index,
)


In [16]:
# Display the standardized survey columns.
df_quant_SQ_STD

Unnamed: 0,SQ_QOL,SQ_IC_Perform,SQ_IC_Effort,SQ_IC_Accomp,SQ_IC_Pride,SQ_IC_Recom,SQ_OC_Perform,SQ_OC_Accomp,SQ_OC_Pride,SQ_OC_Active,...,SQ_SI_Recom,SQ_SL_Morn,SQ_SL_Mid,SQ_SL_When,SQ_SL_Hour,SQ_EXER,SQ_D_Healthy,SQ_D_Unhealthy,SQ_D_Conf,SQ_QOL_Re
0,0.533465,0.592667,1.194924,0.90018,0.86724,0.63847,-0.926285,-0.818994,-0.84,-0.62565,...,-0.712627,1.167748,1.521013,-0.041849,1.080123,1.6855,0.955619,0.420426,0.852266,0.457496
1,0.533465,-0.508001,1.194924,0.90018,-0.03469,-0.399043,0.193874,1.073793,1.24,1.407711,...,0.316723,1.167748,2.451515,-0.041849,-0.92582,0.80904,-0.173749,0.420426,0.852266,0.457496
2,0.533465,-2.709336,-2.01901,-0.140028,-1.83855,-0.399043,1.314033,1.073793,1.24,1.407711,...,-0.712627,1.167748,1.521013,1.046217,1.080123,1.6855,-2.432485,0.420426,-0.313993,0.457496
3,0.533465,0.592667,-0.947698,-1.180236,-0.03469,-1.436556,0.193874,0.127399,0.2,0.391031,...,-1.741977,1.167748,-0.339991,-0.041849,1.080123,-0.94388,0.955619,-0.865584,0.852266,-1.808197
4,0.533465,0.592667,-0.947698,-0.140028,-0.93662,-2.474069,0.193874,1.073793,0.2,0.391031,...,-0.712627,-1.868397,-1.270493,-0.041849,-0.92582,-0.06742,-0.173749,-2.151594,-1.480252,-1.808197
5,-0.622376,-2.709336,0.123613,-2.220444,-1.83855,-1.436556,1.314033,1.073793,1.24,1.407711,...,1.346073,2.179797,-0.339991,1.046217,-0.92582,0.80904,-0.173749,0.420426,0.852266,1.590342
6,-0.622376,-0.508001,-0.947698,-1.180236,-0.93662,-1.436556,0.193874,0.127399,0.2,0.391031,...,-1.741977,-0.856349,-0.339991,-1.129915,1.080123,-0.94388,0.955619,0.420426,0.852266,-0.675351
7,0.533465,-0.508001,0.123613,0.90018,0.86724,0.63847,1.314033,1.073793,0.2,0.391031,...,1.346073,-1.868397,-1.270493,-1.129915,-0.92582,-0.94388,-0.173749,0.420426,-1.480252,0.457496
8,1.689306,0.592667,1.194924,1.940388,-0.03469,1.675982,1.314033,1.073793,1.24,0.391031,...,1.346073,0.1557,-0.339991,-0.041849,-0.92582,-0.94388,0.955619,0.420426,-0.313993,1.590342
9,0.533465,1.693335,1.194924,-1.180236,0.86724,0.63847,0.193874,-1.765388,-0.84,0.391031,...,1.346073,1.167748,-0.339991,1.046217,1.080123,-0.06742,-0.173749,0.420426,-0.313993,1.590342


In [None]:
###Dataframes ready for analysis###

### df 
# dataframe where some columns hold qualitative values (SQ_SL_When, SQ_SL_Hour, SQ_EXER, SQ_D_Healthy, SQ_D_Unhealthy)
# contains the student information columns

### df_quant
# dataframe where all columns but student information are integers
# contains student information

### df_SQ
# same as df, but student information columns are dropped

### df_quant_SQ
# same as df_quant, but student information columns are dropped
# dataframe in which all values are integers
# use it for the correlation matrix and other analyses

### df_quant_SQ_STD
# same as df_quant_SQ, but all values are standardized
# all columns have the same scale now (mean=0, variance=1)
# good for PCA and other analyses