In [2]:
import unittest
import pandas as pd
import tempfile
import os
import shutil
import re

In [3]:
# 导入需要测试的函数
def isLegal(text):
    # 检查非 ASCII 字符比例
    non_ascii_count = sum(1 for char in text if ord(char) >= 128)
    non_ascii_ratio = non_ascii_count / len(text) if text else 0
    if non_ascii_ratio > 0.5:  # 非 ASCII 字符比例超过 50% 认为可能是乱码
        return True

    # 检查无效的 Unicode 字符组合
    # 这里简单检查是否有连续的代理对（无效的 Unicode 组合）
    if re.search(r'[\ud800-\udbff][\udc00-\udfff]{2,}', text):
        return True

    # 检查是否有孤立的控制字符
    if re.search(r'[\x00-\x1f](?![\x00-\x1f])', text):
        return True

    return False


def data_cleaning(input_file_path, output_file_path):
    # 加载数据
    data = pd.read_csv(input_file_path)

    # 去除重复行
    data = data.drop_duplicates()

    # 检查并处理乱码（更新内容描述和 Update Type）
    for col in ['Update Content', 'Update Type']:
        data = data[~data[col].apply(isLegal)]

    # 检查 View Count 的值是否大于 0
    data = data[data['View Count'] > 0]

    # 检查和修正 Update Time 列的数据格式为日期格式
    try:
        data['Update Time'] = pd.to_datetime(data['Update Time'])
    except ValueError as e:
        print(f"日期转换错误: {e}")

    # 保存清洗后的数据
    data.to_csv(output_file_path, index=False)

input_file_path = 'D:/大三上/开源软件/大作业/issues_data.csv'
output_file_path = 'D:/大三上/开源软件/大作业/issues_data_cleaned.csv'
data_cleaning(input_file_path, output_file_path)

In [6]:
class TestDataCleaning(unittest.TestCase):
    def setUp(self):
        # 创建一个临时目录用于存储测试文件
        self.test_dir = tempfile.mkdtemp()
        self.input_file_path = os.path.join(self.test_dir, 'test_input.csv')
        self.output_file_path = os.path.join(self.test_dir, 'test_output.csv')
        # 创建测试数据，用以检测isLegal和data_cleaning函数是否正常执行
        self.test_data = pd.DataFrame({
            'Update Content': ['File.match? has an issue with paths', 'Reduce CI usage', 'Truncated quotient and remainder'],
            'Update Type': ['kind:bug', 'topic:compiler:semantic', 'platform:darwin'],
            'View Count': [15218, 14582, 13899],
            'Update Time': ['2025-01-17', '2025-01-06', 'invalid_date', ]
        })
        self.test_data.to_csv(self.input_file_path, index=False)

    def tearDown(self):
        # 清理临时目录和文件
        shutil.rmtree(self.test_dir)

    def test_isLegal(self):
        # 测试正常文本
        self.assertFalse(isLegal('normal text'))
        # 测试高比例非 ASCII 字符
        self.assertTrue(isLegal('乱码数据' + '\u0080' * 100))
        # 测试无效的 Unicode 组合
        self.assertTrue(isLegal('无效Unicode组合\uD834\uDF06\uD834\uDF06\uD834\uDF06'))
        # 测试孤立的控制字符
        self.assertTrue(isLegal('控制字符\x01normal'))
        # 测试空字符串
        self.assertFalse(isLegal(''))
        # 测试 None
        #self.assertFalse(isLegal(None))

    def test_data_cleaning(self):
        # 调用数据清洗函数
        data_cleaning(self.input_file_path, self.output_file_path)
        # 检查文件是否存在
        self.assertTrue(os.path.exists(self.output_file_path))
        # 读取清洗后的数据
        cleaned_data = pd.read_csv(self.output_file_path)
        # 检查数据行数是否正确
        self.assertEqual(len(cleaned_data), 2)
        # 检查 View Count 是否大于 0
        self.assertTrue(all(cleaned_data['View Count'] > 0))
        # 检查 Update Time 列的数据格式
        self.assertTrue(pd.api.types.is_datetime64_any_dtype(cleaned_data['Update Time']))
        # 检查乱码数据是否被过滤
        self.assertNotIn('乱码数据' + '\u0080' * 100, cleaned_data['Update Content'].tolist())
        self.assertNotIn('无效Unicode组合\uD834\uDF06\uD834\uDF06\uD834\uDF06', cleaned_data['Update Content'].tolist())
        self.assertNotIn('控制字符\x01normal', cleaned_data['Update Content'].tolist())



In [7]:
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

FE
ERROR: test_isLegal (__main__.TestDataCleaning.test_isLegal)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\heisenlindangao\AppData\Local\Temp\ipykernel_3664\36500686.py", line 32, in test_isLegal
    self.assertFalse(isLegal(None))
                     ^^^^^^^^^^^^^
  File "C:\Users\heisenlindangao\AppData\Local\Temp\ipykernel_3664\816911571.py", line 4, in isLegal
    non_ascii_count = sum(1 for char in text if ord(char) >= 128)
                                        ^^^^
TypeError: 'NoneType' object is not iterable

FAIL: test_data_cleaning (__main__.TestDataCleaning.test_data_cleaning)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\heisenlindangao\AppData\Local\Temp\ipykernel_3664\36500686.py", line 42, in test_data_cleaning
    self.assertEqual(len(cleaned_data), 2)
AssertionError: 3 != 2

--------------------------------------

日期转换错误: time data "invalid_date" doesn't match format "%Y-%m-%d", at position 2. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
