In [None]:
import json
import os, sys
import numpy as np
import pandas as pd
from langdetect import detect_langs

import matplotlib.pyplot as plt

import seaborn as sns
from pandarallel import pandarallel
# Start pandarallel so that `parallel_apply` is available on pandas objects;
# the progress bar is turned off.
pandarallel.initialize(progress_bar=False)

In [None]:
# Locations of the train/test JSON Lines files, relative to the working directory.
dataset_dir = os.path.join('..', 'dataset')
train_path = os.path.join(dataset_dir, 'train.jsonl')
test_path = os.path.join(dataset_dir, 'test.jsonl')

In [None]:
# Load the training set from its JSON Lines file (one JSON record per line).
df_train = pd.read_json(train_path, lines=True)
# Show the loaded frame as the cell output.
df_train

In [None]:
# Load the test set from its JSON Lines file (one JSON record per line),
# then show it as the cell output.
df_test = pd.read_json(test_path, lines=True)
df_test

In [None]:
# 1. Language distribution of the data
def lang_detect(text):
    """Return the most probable language code for ``text``.

    Args:
        text: The text to classify.

    Returns:
        The ``lang`` attribute of the first (highest-ranked) candidate
        returned by ``detect_langs``, or ``'unknown'`` when detection fails
        for any reason (e.g. text with no usable features, non-string input).
    """
    # Imported here so the seed is also applied inside pandarallel worker
    # processes, which may not share the parent's module state.
    from langdetect import DetectorFactory

    # langdetect is non-deterministic by default; a fixed seed makes the
    # language counts reproducible across runs.
    DetectorFactory.seed = 0
    try:
        return detect_langs(text)[0].lang
    except Exception:
        # Best-effort: one undetectable row must not abort the whole apply.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate so the cell can still be interrupted.
        return 'unknown'
    
# Language distribution of the test set: run lang_detect on every 'text' value
# (in parallel via pandarallel) and store the result in a new 'lang' column.
df_test['lang'] = df_test['text'].parallel_apply(lang_detect)
# Bar chart of the language distribution, with count and percentage on top of each bar.
def plot_lang_distribution(df):
    """Draw a bar chart of how many rows fall under each language.

    Args:
        df: DataFrame with a ``'lang'`` column holding one language label
            per row.

    Each bar is annotated with its row count and its share of all counted
    rows. The figure is shown immediately; nothing is returned.
    """
    lang_counts = df['lang'].value_counts()
    total = lang_counts.sum()

    # Draw on a dedicated figure/axes instead of whatever axes happens to be
    # current, so leftover figures from earlier cells are never overdrawn.
    _, ax = plt.subplots()
    lang_counts.plot(kind='bar', ax=ax)

    # value_counts() is sorted by frequency, so the i-th count belongs to the
    # i-th bar from the left.
    for i, count in enumerate(lang_counts):
        ax.text(i, count, f'{count} ({count / total:.1%})', ha='center', va='bottom')

    ax.set_title('Language Distribution')
    ax.set_xlabel('Language')
    ax.set_ylabel('Count')
    plt.show()
plot_lang_distribution(df_test)

# Language distribution of the training set: detect a language label for every
# 'text' value (in parallel) and store it in a new 'lang' column.
df_train['lang'] = df_train['text'].parallel_apply(lang_detect)
# Annotated bar chart of the training-set language counts.
plot_lang_distribution(df_train)

In [None]:
# 2. Distribution of text lengths
def plot_length_distribution(df, prefix='train'):
    """Plot a histogram (with KDE curve) of the word count of each text.

    Args:
        df: DataFrame with a ``'text'`` column.
        prefix: Name of the data split; capitalized and used in the title.

    Side effect: the per-row word counts are written to ``df['length']``,
    so the frame passed in is modified in place.
    """
    # Word count per row: split on whitespace, then count the pieces.
    df['length'] = df['text'].str.split().str.len()

    # Histogram on an explicit 10x6 axes.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['length'], bins=100, kde=True, ax=ax)
    ax.set_title(f'{prefix.capitalize()} Length Distribution')
    ax.set_xlabel('Length')
    ax.set_ylabel('Frequency')
    plt.show()
   

plot_length_distribution(df_test, prefix='test')
plot_length_distribution(df_train, prefix='train')

# Length distributions of the training set split by label (1 -> df_train_llm,
# 0 -> df_train_human). plot_length_distribution assigns a 'length' column on
# the frame it receives, so each subset is an explicit copy: assigning to a
# boolean-filtered slice would otherwise raise SettingWithCopyWarning.
df_train_llm = df_train[df_train['label'] == 1].copy()
df_train_human = df_train[df_train['label'] == 0].copy()

plot_length_distribution(df_train_llm, prefix='train_llm')
plot_length_distribution(df_train_human, prefix='train_human')