In [1]:
"""
This script compares dedup/k12/release-20240523/KSJ_XES against replace_data_3.
It demonstrates how to handle the case where the two data structures are inconsistent.
"""
import os
import json
from tqdm import tqdm
import difflib
from concurrent.futures import ThreadPoolExecutor, as_completed
from difflib import SequenceMatcher

dir1 = "/fs-computility/llm/shared/hongjiawei/zk_train_internlm/datasets/cn_exam/dedup/k12/release-20240523/KSJ_XES"
dir2 = "/fs-computility/llm/shared/huhanglei/math_equation/data_filter/replace_data_3"


def compare_fields(data1_content, data2_content):
	"""
	Report how two records differ on ``question_list`` and ``q_main``.

	:param data1_content: Record from the first dataset; must provide both keys.
	:param data2_content: Record from the second dataset; must provide both keys.
	:return: Dict with one entry per field whose values differ. Each entry holds
		the value from either side (``dir1`` / ``dir2``) and ``diff``, the
		``difflib.unified_diff`` output for the two values as a list. The dict
		is empty when both fields compare equal.
	"""
	differences = {}
	# Fields are visited in this order so question_list precedes q_main in the result.
	for field in ('question_list', 'q_main'):
		left = data1_content[field]
		right = data2_content[field]
		if left != right:
			differences[field] = {
				'dir1': left,
				'dir2': right,
				'diff': list(difflib.unified_diff(left, right, lineterm='')),
			}
	return differences


def read_jsonl(file_path, progress_bar):
	"""
	Read one jsonl file and advance the progress bar once per physical line.

	Blank lines and lines that hold only the JSON literal ``null`` are skipped.
	The check is made on the stripped line, so a final ``null`` without a
	trailing newline is skipped too and an empty line no longer reaches
	``json.loads``.

	:param file_path: Path to the jsonl file.
	:param progress_bar: Object with an ``update(n)`` method (e.g. a tqdm bar);
		it is called with 1 for every line read, including skipped ones.
	:return: List of parsed records in file order.
	:raises json.JSONDecodeError: If a non-blank, non-null line is not valid JSON.
	"""
	data = []
	with open(file_path, 'r', encoding='utf-8') as file:
		for line in file:
			stripped = line.strip()
			if stripped and stripped != 'null':
				data.append(json.loads(stripped))
			progress_bar.update(1)
	return data


def read_all_jsonl(dir, workers=10):
	"""
	Read all jsonl files in a directory using multiple threads.

	Every file is read twice: once to count lines for the progress-bar total,
	then once more by ``read_jsonl`` to parse the records.

	:param dir: Directory containing jsonl files (any name ending in "jsonl").
	:param workers: Number of worker threads.
	:return: List of data from all jsonl files. Records from one file stay in
		file order, but files are appended in the order their reads complete,
		so the overall order can vary between runs.
	"""
	files = [os.path.join(dir, f) for f in os.listdir(dir) if f.endswith("jsonl")]

	# Count lines with a context manager so each handle is closed right away
	# instead of being left for the garbage collector.
	total_lines = 0
	for file in files:
		with open(file, 'r', encoding='utf-8') as handle:
			total_lines += sum(1 for _ in handle)

	data = []
	with tqdm(total=total_lines, desc="Reading JSONL files", unit="line") as progress_bar:
		with ThreadPoolExecutor(max_workers=workers) as executor:
			future_to_file = {executor.submit(read_jsonl, file, progress_bar): file for file in files}
			for future in as_completed(future_to_file):
				# result() re-raises any exception raised inside the worker.
				data.extend(future.result())

	return data



In [2]:
# Load every record of both directories into memory, 30 reader threads each.
data1 = read_all_jsonl(dir=dir1, workers=30)
data2 = read_all_jsonl(dir=dir2, workers=30)

Reading JSONL files: 100%|██████████| 4618909/4618909 [06:56<00:00, 11093.57line/s]
Reading JSONL files:  39%|███▉      | 2489862/6343419 [02:41<1:12:53, 881.10line/s] IOStream.flush timed out
Reading JSONL files: 100%|██████████| 6343419/6343419 [07:52<00:00, 13412.29line/s]  


In [4]:
# Containers: per-dataset key lists and key -> record lookups.
exam_problem_id1, exam_problem_id2 = [], []
exam_problem2data1, exam_problem2data2 = {}, {}

# One pass per dataset fills both the key list and the lookup.
# data1 records expose the key directly as 'id'.
for d in tqdm(data1):
	key = str(d['id'])
	exam_problem2data1[key] = d
	exam_problem_id1.append(key)

# data2 keys are assembled as "<exam_id>_<problem_id>".
for d in tqdm(data2):
	key = "_".join((str(d['exam_id']), str(d['problem_id'])))
	exam_problem2data2[key] = d
	exam_problem_id2.append(key)
output_path = "output/diff_xes_clean_0524.jsonl"
# Intersection: keys present in both datasets.
intersection = exam_problem2data1.keys() & exam_problem2data2.keys()

100%|██████████| 4618909/4618909 [00:05<00:00, 921351.23it/s]
100%|██████████| 6343419/6343419 [00:08<00:00, 790237.77it/s]


In [5]:
# Materialize the shared keys as a list; the first 100 are kept as a small
# sample (presumably for quick trial runs — it is not used in the visible cells).
intersection_list = [*intersection]
test_intersection = intersection_list[0:100]
len(intersection_list)

3848141

In [36]:
# def remove_symbols(text):
# 	# Remove every character that is not a letter or a digit
# 	text = re.sub(r'\$\$', '$', text)
# 	# Remove everything between a pair of $ delimiters
# 	text = re.sub(r'\$[^\$]*\$', '', text)
# 	# text = re.sub(r'\\\\[a-zA-Z]+\\\\', '', text)
# 	text = text.replace('_', '')
# 	text = re.sub(r'[^\w\u4e00-\u9fff]', '', text)
# 
# 	return text
# i = "d7291b38e95b4a39892e95e1a6c82f62_28"
# data1_content = exam_problem2data1[i]
# data2_content = exam_problem2data2[i]
# q_main_list1 = get_q_main_for_xes_clean(data1_content)
# q_main_list2 = get_q_main(data2_content)
# print(q_main_list1, q_main_list2)
# text1 = remove_symbols(q_main_list1)
# text2 = remove_symbols(q_main_list2)
# print(text1, text2)

已知二次函数$$y=m{{x}^{2}}-2mx-3$$（$$m$$是实数）．甲求得当$$x=1$$时，$$y=1$$；乙求得当$$x=\frac{1}{2}$$时，$$y=\frac{1}{2}$$．若甲求得的结果正确，你认为乙求得的结果正确吗？说明理由．写出二次函数的对称轴，并求出该函数的最值（用含$$m$$的代数式表示）．图象上有两点$$P\left( {{x}_{1}},p \right)$$和$$Q\left( {{x}_{2}},q \right)$$，若$${{x}_{1}} <1< {{x}_{2}}$$，$${{x}_{1}}+{{x}_{2}}>2$$，试比较$$p$$和$$q$$的大小关系． 已知二次函数$y = mx^2 - 2mx - 3$（$m$是实数）．甲求得当$x = 1$时，$y = 1$；乙求得当$x = \frac{1}{2}$时，$y = \frac{1}{2}$．若甲求得的结果正确，你认为乙求得的结果正确吗？说明理由．写出二次函数的对称轴，并求出该函数的最值（用含$m$的代数式表示）．图象上有两点$Pleft( {{x}_{1}},p \right)$和$Qleft( {{x}_{2}},q \right)$，若${{x}_{1}} <1< {{x}_{2}}$，${{x}_{1}} + {{x}_{2}} > 2$，试比较$p$和$q$的大小关系．
已知二次函数是实数甲求得当时乙求得当时若甲求得的结果正确你认为乙求得的结果正确吗说明理由写出二次函数的对称轴并求出该函数的最值用含的代数式表示图象上有两点和若试比较和的大小关系 已知二次函数是实数甲求得当时乙求得当时若甲求得的结果正确你认为乙求得的结果正确吗说明理由写出二次函数的对称轴并求出该函数的最值用含的代数式表示图象上有两点和若试比较和的大小关系


In [46]:
import difflib
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import re
from copy import deepcopy


def ensure_output(output_path):
	"""
	Create an empty output file, creating its parent directory when missing.

	NOTE: the file is opened in 'w' mode, so an existing file at
	``output_path`` is truncated to zero length.

	:param output_path: Path of the file to create; a falsy value (``None`` or
		an empty string) makes the call a no-op.
	:return: None.
	"""
	if not output_path:
		return

	# Without this, a relative path such as "output/x.jsonl" fails with
	# FileNotFoundError when the "output" directory does not exist yet.
	parent_dir = os.path.dirname(output_path)
	if parent_dir:
		os.makedirs(parent_dir, exist_ok=True)

	with open(output_path, 'w', encoding='utf-8'):
		pass


def remove_symbols(text):
	"""
	Keep only the characters of ``text`` in the range U+4E00..U+9FA5.

	Everything outside that range (Latin letters, digits, punctuation,
	whitespace, ``$`` math markup, ...) is dropped; the kept characters stay in
	their original order.

	:param text: Input string.
	:return: String made of the kept characters; empty if none match.
	"""
	# Delete every run of characters that falls outside the kept range.
	return re.sub(r'[^\u4e00-\u9fa5]+', '', text)



def get_q_main_for_xes_clean(content):
	"""
	Build the full question text of a record whose fields sit under ``'ori'``.

	:param content: Record with ``content['ori']`` providing ``question_list``,
		``q_main`` and ``options``.
	:return: ``q_main`` followed by every item of ``question_list`` and then
		every item of ``options``, concatenated in that order.
	"""
	ori = content['ori']
	question_list = ori['question_list']
	text = ori['q_main']
	options = ori['options']
	# Sub-questions first, answer options last.
	for piece in (*question_list, *options):
		text += piece
	return text


def get_q_main(content):
	"""
	Build the full question text of a record with top-level fields.

	:param content: Record providing ``question_list``, ``q_main`` and
		``options`` directly.
	:return: ``q_main`` followed by every item of ``question_list`` and then
		every item of ``options``, concatenated in that order.
	"""
	question_list = content['question_list']
	text = content['q_main']
	options = content['options']
	# Sub-questions first, answer options last.
	for piece in (*question_list, *options):
		text += piece
	return text


def calculate_similarity_and_diff(content1, content2):
	"""
	Compare the question text of two records after symbol stripping.

	:param content1: Record in the layout read by ``get_q_main_for_xes_clean``.
	:param content2: Record in the layout read by ``get_q_main``.
	:return: Tuple ``(similarity, diff, q_main_list1, q_main_list2, text1, text2)``:
		the ``SequenceMatcher`` ratio of the stripped texts, their unified diff
		as a list of lines, the two raw question texts, and the two stripped
		texts.

	NOTE(review): ``SequenceMatcher`` runs with its default ``autojunk``
	heuristic; confirm that is acceptable for long texts.
	"""
	# Raw concatenated question text from each side.
	q_main_list1 = get_q_main_for_xes_clean(content1)
	q_main_list2 = get_q_main(content2)

	# Stripped versions are what actually get compared.
	text1 = remove_symbols(q_main_list1)
	text2 = remove_symbols(q_main_list2)

	matcher = difflib.SequenceMatcher(None, text1, text2)
	similarity = matcher.ratio()

	# Line-based diff of the stripped texts.
	lines1, lines2 = text1.splitlines(), text2.splitlines()
	diff = [*difflib.unified_diff(lines1, lines2, lineterm='')]

	return similarity, diff, q_main_list1, q_main_list2, text1, text2


import threading

# Serializes appends to the shared output file: process_intersection is run
# from several worker threads, and unsynchronized writes of long JSON lines
# could interleave and corrupt the jsonl file.
_OUTPUT_WRITE_LOCK = threading.Lock()


def process_intersection(intersection_id, visited, output_path):
	"""
	Compare one shared record pair and persist it when the texts diverge.

	The pair is looked up in the module-level ``exam_problem2data1`` and
	``exam_problem2data2`` mappings. It is reported only when the similarity
	is below 0.9 AND the stripped text of the first record is shorter than
	that of the second.

	:param intersection_id: Key present in both lookup mappings.
	:param visited: Collection of ids to skip; it is only read here, never
		updated.
	:param output_path: jsonl file that receives one appended line per
		reported pair.
	:return: The reported dict, or ``None`` when the id was skipped or the pair
		did not meet the reporting condition.
	:raises KeyError: If the id is missing from either lookup mapping.
	"""
	if intersection_id in visited:
		return None

	data1_content = exam_problem2data1[intersection_id]
	data2_content = exam_problem2data2[intersection_id]

	similarity, diff, q_main1, q_main2, text1, text2 = calculate_similarity_and_diff(data1_content, data2_content)
	if similarity < 0.9 and len(text1) < len(text2):
		res_dict = {
			'exam_id': data2_content['exam_id'],
			'problem_id': data2_content['problem_id'],
			'similarity': similarity,
			'diff': diff,
			'q_main1': q_main1,
			'q_main2': q_main2,
			'text1': text1,
			'text2': text2,
			"data1": data1_content,
			"data2": data2_content
		}

		# Serialize outside the lock so only the file append is exclusive.
		serialized = json.dumps(res_dict, ensure_ascii=False) + '\n'
		with _OUTPUT_WRITE_LOCK:
			with open(output_path, 'a', encoding='utf-8') as file:
				file.write(serialized)

		return res_dict

	return None


# Ensure output file exists
# output_path = 'output.jsonl'
# os.makedirs(os.path.dirname(output_path), exist_ok=True)
# # Touch output file
# open(output_path, 'w').close()

# Ids to skip; handed to every worker, which only reads it.
visited = set()
# Records reported during this run, gathered from the worker results.
all_differences = []
# NOTE(review): ensure_output (as defined in this file) opens the path in 'w'
# mode, so any earlier output is emptied here. The scan below then reads zero
# lines (see the "0it" in the cell output) and `visited` always starts empty.
ensure_output(output_path)
# Load already visited IDs from output file if exists
if os.path.exists(output_path):
	with open(output_path, 'r', encoding='utf-8') as file:
		for line in tqdm(file):
			res_dict = json.loads(line)
			# print(res_dict)
			# NOTE(review): process_intersection also stores exam_id / problem_id
			# at the top level of each record; reading them from "data1" assumes
			# data1 records carry those fields — confirm.
			exam_id = res_dict["data1"]['exam_id']
			problem_id = res_dict["data1"]['problem_id']
			visited.add(f"{exam_id}_{problem_id}")

# Use ThreadPoolExecutor for multithreading
# One task is submitted per shared id up front; `futures` maps each future back to its id.
with ThreadPoolExecutor(max_workers=10) as executor:
	futures = {executor.submit(process_intersection, intersection_id, visited, output_path): intersection_id for
			   intersection_id in intersection_list}

	# Results arrive in completion order; result() re-raises worker exceptions.
	for future in tqdm(as_completed(futures), total=len(futures)):
		result = future.result()
		if result is not None:
			all_differences.append(result)

# Optionally do something with all_differences list


0it [00:00, ?it/s]
100%|██████████| 3848141/3848141 [00:15<00:00, 242872.02it/s]


In [49]:
import os
import json
from collections import Counter, defaultdict
from tqdm import tqdm

# NOTE(review): output_path already ends in ".jsonl", so this name ends in
# ".jsonl.jsonl" (visible in the cell output); kept as-is.
example_diff_path = f"example_diff_{os.path.basename(output_path)}.jsonl"
# output_path = "output/diff.jsonl"
result_example_diff = []

if os.path.exists(output_path):
	# Number of reported records per (subject, question type) pair.
	counter = Counter()
	# Every reported record, grouped by the same pair.
	example_diff_dict = defaultdict(list)
	record_count = 0

	with open(output_path, 'r', encoding='utf-8') as file:
		for record_count, line in enumerate(tqdm(file), start=1):
			# if record_count >= 10000:  # stop once the maximum record count is reached
			# 	break
			res_dict = json.loads(line)
			dd = res_dict['data2']
			subject, q_type = dd['major'], dd['q_type']
			group = (subject, q_type)
			counter[group] += 1
			example_diff_dict[group].append(res_dict)

	# Keep the first record seen for each (subject, q_type) pair and show the pair.
	for key, value in example_diff_dict.items():
		print(key)
		result_example_diff.append(value[0])

	# Persist the selected examples, one JSON object per line.
	with open(example_diff_path, 'w', encoding='utf-8') as outfile:
		outfile.writelines(
			json.dumps(item, ensure_ascii=False) + '\n' for item in result_example_diff
		)

	print("Processing completed and results saved to", example_diff_path)
	print("Counter results:", counter)



1254it [00:00, 7520.87it/s]

('英语', '完形填空题')
('数学', '填空题')
('语文', '名著阅读题')
('历史', '材料题')
('语文', '文言文题')
('数学', '解答题')
('语文', '综合性学习题')
('语文', '现代文阅读题')
('政治', '材料题')
('英语', '阅读理解题')
('英语', '语法填空题')
('化学', '填空题')
('语文', '基础知识题')
('化学', '简答题')
('数学', '单选题')
('物理', '解答题')
('物理', '计算题')
('语文', '古代诗歌题')
('英语', '单选题')
('英语', '补全对话题')
('化学', '单选题')
('英语', '简答题')
('生物', '单选题')
('英语', '填空题')
('语文', '默写题')
('数学', '选择题')
('英语', '信息匹配题')
('数学', '多选题')
('数学', '计算题')
('英语', '改错题')
('英语', '书面表达题')
('历史', '填空题')
('地理', '材料题')
('物理', '填空题')
('英语', '适当形式填空题')
('物理', '单选题')
('英语', '句型转换题')
('英语', '多选题')
('历史', '单选题')
('地理', '单选题')
('科学', '填空题')
('科学', '解答题')
('英语', '翻译题')
('英语', '首字母填空题')
('政治', '填空题')
('英语', '语法选择题')
('英语', '排序题')
('科学', '单选题')
('英语', '完成句子题')
('政治', '单选题')
Processing completed and results saved to example_diff_diff_xes_clean_0524.jsonl.jsonl
Counter results: Counter({('数学', '解答题'): 324, ('历史', '材料题'): 173, ('政治', '材料题'): 120, ('数学', '填空题'): 80, ('英语', '语法填空题'): 61, ('语文', '现代文阅读题'): 60, ('英语', '阅读理解题'): 60, ('化学',


