In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
df = pd.read_stata('./CGSS2017.dta', convert_categoricals=False, convert_missing=False)

In [24]:
#选择被调查者的受教育程度（A7a),性别（A2）,出生年份(A3_1)，政治面貌(A10),工作情况(A58),户籍(A18),14岁时家庭地位(A43_d)
respondent = df.loc[:, ['a7a', 'a2', 'a31', 'a10', 'a58', 'a18', 'a43d']]
#选择被调查者父亲的受教育程度(A89b)，政治面貌(A89c)，工作情况(A89d)，专业职称(A89f)，年龄(A89a)
father = df.loc[:, ['a89b', 'a89c', 'a89d', 'a89f', 'a89a']]
#选择被调查者母亲的受教育程度(A90b)，政治面貌(A90c)，工作情况(A90d)，专业职称(A90f)，年龄(A90a)
mother = df.loc[:, ['a90b', 'a90c', 'a90d', 'a90f', 'a90a']]

In [25]:
respondent.loc[:, 'a7a'].describe().rename('受教育程度')

count    12561.000000
mean         5.165751
std          3.274600
min          1.000000
25%          3.000000
50%          4.000000
75%          6.000000
max         13.000000
Name: 受教育程度, dtype: float64

In [33]:
#数据预处理
respondent['a31'] = 2023 - respondent['a31']  #计算被调查者年龄
father['a89a'] = 2023 - father['a89a']  #计算父亲年龄
mother['a90a'] = 2023 - mother['a90a']  #计算母亲年龄

respondent['a10'] = respondent['a10'].apply(lambda x: 1 if x == 4 else 0)  #被调查者是否是共产党员
father['a89c'] = father['a89c'].apply(lambda x: 1 if x == 4 else 0)  #父亲是否是共产党员
mother['a90c'] = mother['a90c'].apply(lambda x: 1 if x == 4 else 0)  #母亲是否是共产党员

respondent['a58'] = respondent['a58'].apply(lambda x: 1 if x == 2 or x == 4 or x == 3 else 0)  #被调查者是否全职务农
father['a89d'] = father['a89d'].apply(lambda x: 1 if x == 2 or x == 4 or x == 3 else 0)  #父亲是否全职务农
mother['a90d'] = mother['a90d'].apply(lambda x: 1 if x == 2 or x == 4 or x == 3 else 0)  #母亲是否全职务农

father['a89f'] = father['a89f'].apply(lambda x: 0 if x == 1 or x == 1 else 1)  #父亲是否有专业技术职称
mother['a90f'] = mother['a90f'].apply(lambda x: 0 if x == 1 or x == 1 else 1)  #母亲是否有专业技术职称

respondent['a18'] = respondent['a18'].apply(lambda x: 1 if x == 1 else 0)  #被调查者是否是农业户口

0    0
1    0
2    0
3    0
4    0
Name: a58, dtype: int64

In [36]:
#考虑到时代因素,我们将父母辈的技校计入了大学学历
data = {
	'小学及以下': [0, 0, 0, 0],
	'初中': [0, 0, 0, 0],
	'高中': [0, 0, 0, 0],
	'大学及以上': [0, 0, 0, 0]
}
index = ['小学及以下', '初中', '高中', '大学及以上']
father_edu = pd.DataFrame(data=data, index=index)
father_cnt = []
mother_edu = pd.DataFrame(data=data, index=index)
mother_cnt= []
father_mother = pd.DataFrame(data=data, index=index)
for i in range(len(df)):
	if respondent['a7a'][i] == np.nan or father['a89b'][i] == np.nan or mother['a90b'][i] == np.nan:
		continue
	if respondent['a7a'][i] <= 3:

		if father['a89b'][i] <= 3:
			father_edu.at['小学及以下', '小学及以下'] += 1
		elif father['a89b'][i] == 4:
			father_edu.at['小学及以下', '初中'] += 1
		elif father['a89b'][i] <= 7:
			father_edu.at['小学及以下', '高中'] += 1
		else:
			father_edu.at['小学及以下', '大学及以上'] += 1

		if mother['a90b'][i] <= 3:
			mother_edu.at['小学及以下', '小学及以下'] += 1
		elif mother['a90b'][i] == 4:
			mother_edu.at['小学及以下', '初中'] += 1
		elif mother['a90b'][i] <= 7:
			mother_edu.at['小学及以下', '高中'] += 1
		else:
			mother_edu.at['小学及以下', '大学及以上'] += 1

	elif respondent['a7a'][i] == 4:

		if father['a89b'][i] <= 3:
			father_edu.at['初中', '小学及以下'] += 1
		elif father['a89b'][i] == 4:
			father_edu.at['初中', '初中'] += 1
		elif father['a89b'][i] <= 7:
			father_edu.at['初中', '高中'] += 1
		else:
			father_edu.at['初中', '大学及以上'] += 1

		if mother['a90b'][i] <= 3:
			mother_edu.at['初中', '小学及以下'] += 1
		elif mother['a90b'][i] == 4:
			mother_edu.at['初中', '初中'] += 1
		elif mother['a90b'][i] <= 7:
			mother_edu.at['初中', '高中'] += 1
		else:
			mother_edu.at['初中', '大学及以上'] += 1
	elif respondent['a7a'][i] <= 8:

		if father['a89b'][i] <= 3:
			father_edu.at['高中', '小学及以下'] += 1
		elif father['a89b'][i] == 4:
			father_edu.at['高中', '初中'] += 1
		elif father['a89b'][i] <= 7:
			father_edu.at['高中', '高中'] += 1
		else:
			father_edu.at['高中', '大学及以上'] += 1

		if mother['a90b'][i] <= 3:
			mother_edu.at['高中', '小学及以下'] += 1
		elif mother['a90b'][i] == 4:
			mother_edu.at['高中', '初中'] += 1
		elif mother['a90b'][i] <= 7:
			mother_edu.at['高中', '高中'] += 1
		else:
			mother_edu.at['高中', '大学及以上'] += 1

	else:
		if father['a89b'][i] <= 3:
			father_edu.at['大学及以上', '小学及以下'] += 1
		elif father['a89b'][i] == 4:
			father_edu.at['大学及以上', '初中'] += 1
		elif father['a89b'][i] <= 7:
			father_edu.at['大学及以上', '高中'] += 1
		else:
			father_edu.at['大学及以上', '大学及以上'] += 1

		if mother['a90b'][i] <= 3:
			mother_edu.at['大学及以上', '小学及以下'] += 1
		elif mother['a90b'][i] == 4:
			mother_edu.at['大学及以上', '初中'] += 1
		elif mother['a90b'][i] <= 7:
			mother_edu.at['大学及以上', '高中'] += 1
		else:
			mother_edu.at['大学及以上', '大学及以上'] += 1
for i in range(len(df)):
	if df['a89b'][i]<=3:
		father_cnt.append(1)
	elif df['a89b'][i]==4:
		father_cnt.append(2)
	elif df['a89b'][i]<=7:
		father_cnt.append(3)
	else:
		father_cnt.append(4)


	if df['a90b'][i]<=3:
		mother_cnt.append(1)
	elif df['a90b'][i]==4:
		mother_cnt.append(2)
	elif df['a90b'][i]<=7:
		mother_cnt.append(3)
	else:
		mother_cnt.append(4)
father_edu, mother_edu, len(df)

(       小学及以下   初中   高中  大学及以上
 小学及以下   3722  169   58    396
 初中      2568  445  190    308
 高中      1235  515  241    244
 大学及以上    779  667  593    452,
        小学及以下   初中   高中  大学及以上
 小学及以下   4064   60   16    205
 初中      3057  244   49    161
 高中      1618  335  141    141
 大学及以上   1137  591  472    291,
 12582)

In [38]:
father_sum = father_edu.sum(axis=0)
mother_sum = mother_edu.sum(axis=0)
father_percent = father_edu.div(father_sum)*100
mother_percent = mother_edu.div(mother_sum)*100
father_percent, mother_percent

(           小学及以下         初中         高中      大学及以上
 小学及以下  44.821773   9.409800   5.360444  28.285714
 初中     30.924855  24.777283  17.560074  22.000000
 高中     14.872351  28.674833  22.273567  17.428571
 大学及以上   9.381021  37.138085  54.805915  32.285714,
            小学及以下         初中         高中      大学及以上
 小学及以下  41.150263   4.878049   2.359882  25.689223
 初中     30.953827  19.837398   7.227139  20.175439
 高中     16.383151  27.235772  20.796460  17.669173
 大学及以上  11.512758  48.048780  69.616519  36.466165)

In [37]:
from scipy.optimize import leastsq


def fun(p, x):
	fa, mo = x
	a, b, c = p
	return a * fa + b * mo + c


def err(p, x, y):
	return fun(p, x) - y


Y = df['a7a']
X = [father_cnt, mother_cnt]
p0 = [1, 1, 1]
Para = leastsq(err, p0, args=(X, Y))
y_fitted = fun(Para[0], X)

plt.figure(figsize=(8, 6))
plt.scatter(X, y_fitted, marker='o', color='blue', label='Fitted')
plt.scatter(X, Y, marker='x', color='red', label='True')
plt.legend(loc=2)
plt.show()

ValueError: operands could not be broadcast together with shapes (25164,) (12582,) 