# In [6]:
import pandas as pd 
import numpy as np

# Seed the global NumPy RNG so every run produces the same synthetic panel.
np.random.seed(42)

# Panel dimensions.
num_users = 183  # total number of reporters
total_records = 4459  # total number of reporter-month rows
division1_5_users = 131  # reporters that belong to divisions 1-5
division1_5_records = 3273  # rows that must come from divisions 1-5
start_year = 2004  # each reporter's first record is January of this year


def _allocate_records(n_users, total, low=12, high=36):
    """Draw ``n_users`` record counts in ``[low, high]`` that sum to ``total``.

    Counts start as uniform integer draws and are then nudged one record at
    a time, on users picked at random *without replacement*, until the sum
    matches.  Picking without replacement matters: an in-place ``+=`` through
    a fancy index is applied only once per distinct index, so repeated picks
    would be silently dropped.

    Raises:
        ValueError: if no assignment within the bounds can reach ``total``.
    """
    if not low * n_users <= total <= high * n_users:
        raise ValueError(
            f"cannot split {total} records over {n_users} users "
            f"with {low}-{high} records each"
        )
    counts = np.random.randint(low, high + 1, size=n_users)
    gap = int(total - counts.sum())
    step = 1 if gap > 0 else -1
    while gap != 0:
        # Only touch users that can move one step and stay inside the bounds.
        if gap > 0:
            movable = np.flatnonzero(counts < high)
        else:
            movable = np.flatnonzero(counts > low)
        chosen = np.random.choice(
            movable, size=min(abs(gap), movable.size), replace=False
        )
        counts[chosen] += step
        gap -= step * chosen.size
    return counts


# Reporter IDs are numbered from 1.
reporter_ids = np.arange(1, num_users + 1)

# Decide first which reporters sit in divisions 1-5, so that record counts
# can be allocated per group and both row totals hold exactly.
division1_5_indices = np.random.choice(num_users, division1_5_users, replace=False)
in_division1_5 = np.zeros(num_users, dtype=bool)
in_division1_5[division1_5_indices] = True

records_per_user = np.empty(num_users, dtype=int)
records_per_user[in_division1_5] = _allocate_records(
    division1_5_users, division1_5_records
)
records_per_user[~in_division1_5] = _allocate_records(
    num_users - division1_5_users, total_records - division1_5_records
)

# 0-based month counter within each reporter's own run of records.
month_index = np.concatenate([np.arange(n) for n in records_per_user])

# One row per reporter-month: the ID repeated, with consecutive calendar
# months starting in January of start_year.
reporter_id_series = np.repeat(reporter_ids, records_per_user)
years = start_year + month_index // 12
months = month_index % 12 + 1

# Time-invariant binary attributes, one draw per reporter.
genders = np.random.choice([0, 1], size=num_users, p=[0.40, 0.60])
educations = np.random.choice([0, 1], size=num_users, p=[0.17, 0.83])
party_members = np.random.choice([0, 1], size=num_users, p=[0.53, 0.47])

# Starting age per reporter (truncated to int, clipped to 22-57); it grows by
# one for every 12 records.
ages = np.random.normal(loc=32.8, scale=5, size=num_users).astype(int)
ages = np.clip(ages, 22, 57)
age_series = np.repeat(ages, records_per_user) + month_index // 12

# Starting tenure per reporter (clipped to 1-27); it grows by 1/12 per record.
tenures = np.random.normal(loc=8.2, scale=3, size=num_users)
tenures = np.clip(tenures, 1, 27)
tenure_series = np.repeat(tenures, records_per_user) + month_index / 12

# Position and qualification are drawn independently for every row.
positions = np.random.choice([1, 2, 3], size=total_records, p=[0.5, 0.3, 0.2])
qualifications = np.random.choice([1, 2, 3], size=total_records, p=[0.4, 0.4, 0.2])

# Divisions: everyone defaults to 6-8, then the pre-selected reporters are
# moved into 1-5.  Their record counts were allocated above to sum to
# division1_5_records, so divisions 1-5 hold exactly that many rows.
divisions = np.random.choice(range(6, 9), size=num_users)
divisions[division1_5_indices] = np.random.choice(range(1, 6), size=division1_5_users)

division_series = np.repeat(divisions, records_per_user)

# Article output variables: normal draws truncated to int, then clipped to
# the bounds given in each call.
num_articles = np.clip(np.random.normal(loc=32.5, scale=21.56, size=total_records).astype(int), 1, 241)
num_words = np.clip(np.random.normal(loc=18356, scale=13233, size=total_records).astype(int), 230, 144280)
quantity_scores = np.clip(np.random.normal(loc=2079, scale=1274, size=total_records).astype(int), 140, 14850)
quality_scores = np.clip(np.random.normal(loc=1476, scale=1097, size=total_records).astype(int), 0, 12300)

# Article-type count columns, in output order.  The matrix below is sized
# from this list so that every column gets its own independent draw; the
# previous fixed width of 8 made "#Coauthorearticle" an exact copy of
# "#ArticleWithExternalAuthors".
# NOTE(review): "#Coauthorearticle" looks like a typo, but it is kept as-is
# because downstream readers of the CSV may depend on the header.
article_type_columns = [
    "#Investigative",
    "#Feature",
    "#Special",
    "#Advertising",
    "#Propaganda",
    "#AssignedWithEditor",
    "#ColumnByContent",
    "#ArticleWithExternalAuthors",
    "#Coauthorearticle",
]
article_types = np.random.randint(
    0, 10, size=(total_records, len(article_type_columns))
)

# Assemble the panel.  Per-reporter attributes are repeated once per record;
# every column must already have total_records entries, so a length mismatch
# raises in the DataFrame constructor instead of being silently truncated.
panel_columns = {
    "Reporter_ID": reporter_id_series,
    "Year": years,
    "Month": months,
    "Gender": np.repeat(genders, records_per_user),
    "Education": np.repeat(educations, records_per_user),
    "Party_member": np.repeat(party_members, records_per_user),
    "Age": age_series,
    "Tenure": tenure_series,
    "Position": positions,
    "Qualification": qualifications,
    "Division": division_series,
    "#Articles": num_articles,
    "#Words": num_words,
    "QuantityScore": quantity_scores,
    "QualityScore": quality_scores,
}
for col_idx, col_name in enumerate(article_type_columns):
    panel_columns[col_name] = article_types[:, col_idx]

df = pd.DataFrame(panel_columns)

# Save as a CSV file (no index column).
csv_path = "adjusted_panel_data.csv"
df.to_csv(csv_path, index=False)

csv_path



# Output: 'adjusted_panel_data.csv'