# Water Quality, Perception and Knowledge in China: *Data Analysis (condensed)*

More information about the thesis, motivation, and methodology is located in the main "thesis_analysis.ipynb" document.

## Load Data

Load Python libraries

In [106]:
import pandas as pd
from pandas import DataFrame
import matplotlib as mpl
import seaborn as sns #for plots
import statsmodels.api as sm #for statistical analysis
import statsmodels.formula.api as smf #for statistical analysis

In [107]:
# Render every column when a DataFrame is displayed (no truncation), which
# makes it easier to look up column names.
pd.options.display.max_columns = None

Load CGSS (social survey) data from a Stata file

In [108]:
# Read the CGSS survey from its Stata file. convert_categoricals=False keeps
# the stored codes instead of converting them to pandas Categoricals built
# from the Stata value labels.
cgss = pd.read_stata(
    '../data/cgss2010_12.dta',
    convert_categoricals=False,
    preserve_dtypes=True,
)

List categorical data

In [109]:
# Names of the survey columns that are treated as categorical.
categoricals = [
    "s41", "a2", "a91",
    "l1a", "l1b",
    "l7a", "l7b",
    "l2409",
]

List important questions

In [110]:
# Column names singled out as the important questions for this analysis.
important = [
    'score', 's41',
    'a2', 'a3a', 'a3b', 'a3c', 'a7a', 'a8a', 'a15', 'a62', 'a91', 'a92',
    'l1a', 'l1b', 'l6a', 'l7a', 'l7b', 'l8a', 'l8b',
    'l12a', 'l12b', 'l12c', 'l137', 'l14d',
    'l15a', 'l15b', 'l16c', 'l20e', 'l2409',
]

Cast the categorical columns to the `category` data type (note: the result is only displayed here; it is not assigned back to `cgss`)

In [111]:
# Cast the listed columns to the pandas `category` dtype and display the result.
# NOTE(review): the converted frame is not assigned back, so `cgss` itself keeps
# its original dtypes and every later cell works with the unconverted columns.
# Persisting the conversion would change how the merge on s41 and the formula
# models below treat these columns (a2, a91 and the dependent variable l2409)
# -- TODO confirm which behaviour is intended.
cgss[categoricals].astype('category')

Unnamed: 0,s41,a2,a91,l1a,l1b,l7a,l7b,l2409
0,28.0,2,1.0,,,,,
1,28.0,2,1.0,,,,,
2,28.0,2,2.0,,,,,
3,28.0,2,2.0,8.0,2.0,98.0,98.0,8.0
4,28.0,1,2.0,,,,,
...,...,...,...,...,...,...,...,...
11778,1.0,1,2.0,,,,,
11779,1.0,1,2.0,,,,,
11780,1.0,1,2.0,,,,,
11781,1.0,1,2.0,2.0,4.0,10.0,98.0,8.0


Load province data (province names and codes) from a .csv; the province code is kept as a regular column because it is used as a merge key below

In [112]:
# Province lookup table; the path is relative to the working directory.
provinces = pd.read_csv(filepath_or_buffer='prov.csv')

Load water quality data

In [113]:
# Water quality table, parsed as space-delimited UTF-8 text.
wqir = pd.read_csv(
    '../data/wqir2018_zh.csv',
    encoding="UTF-8",
    sep=' ',
)

---

## Merge data into one dataframe

Group the water quality data (WQIR) by province and compute the mean

In [114]:
# Average every numeric WQIR column within each province; the result is
# indexed by `province`.
# numeric_only=True excludes non-numeric (text) columns explicitly: pandas >= 2.0
# raises a TypeError when asked to average them, whereas older versions
# silently dropped them.
wqir_mean = wqir.groupby(by='province').mean(numeric_only=True)

Merge the mean water quality per province and the province dataframe (matching names and province codes)

In [115]:
# Attach the province lookup columns to the per-province means, matching rows
# on the shared `province` key (pandas' default inner join).
merge = wqir_mean.merge(provinces, on='province')

Keep only the province code, the mean score, and the province names (which drops the rank column), then merge the result into the main CGSS dataframe so that each entry has the mean water score of its province, plus the names of that province (Chinese short, Chinese full, and English).

In [116]:
# Keep only the merge key (s41), the mean score and the province name columns.
wq = merge.loc[:, ['s41','score','province','province_full','province_en']]
# Inner join on s41: survey rows without a matching s41 in `wq` are dropped.
cgss_wq = cgss.merge(wq, on='s41')

---

## Statistical analysis

In [117]:
# Ordinary least squares: l2409 regressed on the province water score and the
# a2, a3a, a7a and a91 survey columns.
ols_perception = smf.ols(
    formula='l2409 ~ score + a2 + a3a + a7a + a91',
    data=cgss_wq,
).fit()
ols_perception.summary()

0,1,2,3
Dep. Variable:,l2409,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,17.31
Date:,"Tue, 02 Mar 2021",Prob (F-statistic):,5.64e-17
Time:,12:04:13,Log-Likelihood:,-9049.1
No. Observations:,3671,AIC:,18110.0
Df Residuals:,3665,BIC:,18150.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.2982,2.615,0.496,0.620,-3.828,6.425
score,0.0287,0.011,2.700,0.007,0.008,0.050
a2,0.2350,0.095,2.462,0.014,0.048,0.422
a3a,0.0025,0.001,1.894,0.058,-8.85e-05,0.005
a7a,-0.1321,0.018,-7.419,0.000,-0.167,-0.097
a91,-0.0466,0.107,-0.436,0.663,-0.256,0.163

0,1,2,3
Omnibus:,618.569,Durbin-Watson:,1.687
Prob(Omnibus):,0.0,Jarque-Bera (JB):,990.131
Skew:,-1.271,Prob(JB):,9.909999999999999e-216
Kurtosis:,3.124,Cond. No.,109000.0


In [118]:
# NOTE(review): this formula is identical to the one used for `ols_perception`
# (same dependent variable l2409, same predictors, same data), so the two
# models and their summaries are the same. Presumably one of the two models
# should use a different dependent variable -- TODO confirm which.
ols_knowledge = smf.ols('l2409 ~ score + a2 + a3a + a7a + a91', data = cgss_wq).fit()
ols_knowledge.summary()

0,1,2,3
Dep. Variable:,l2409,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,17.31
Date:,"Tue, 02 Mar 2021",Prob (F-statistic):,5.64e-17
Time:,12:04:13,Log-Likelihood:,-9049.1
No. Observations:,3671,AIC:,18110.0
Df Residuals:,3665,BIC:,18150.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.2982,2.615,0.496,0.620,-3.828,6.425
score,0.0287,0.011,2.700,0.007,0.008,0.050
a2,0.2350,0.095,2.462,0.014,0.048,0.422
a3a,0.0025,0.001,1.894,0.058,-8.85e-05,0.005
a7a,-0.1321,0.018,-7.419,0.000,-0.167,-0.097
a91,-0.0466,0.107,-0.436,0.663,-0.256,0.163

0,1,2,3
Omnibus:,618.569,Durbin-Watson:,1.687
Prob(Omnibus):,0.0,Jarque-Bera (JB):,990.131
Skew:,-1.271,Prob(JB):,9.909999999999999e-216
Kurtosis:,3.124,Cond. No.,109000.0
