# Data Cleaning

In [35]:
import pandas as pd
from datetime import datetime

## NLP cleaning

In [None]:
# Load the two input tables (the "true" and "false" modeling sets) in one pass;
# each path is read with pandas' default CSV settings.
true, false = map(
    pd.read_csv,
    (
        "output/modeling_data_true_final.csv",
        "output/modeling_data_false_final.csv",
    ),
)

In [None]:
def parse_chinese_number(s):
    """Convert a count such as '1.37亿', '14.3万', '100万+' or '1234' to a float.

    Parameters
    ----------
    s : str, int, float or None
        Raw count. Any '+' sign and thousands separators (',') are ignored.
        A '亿' suffix multiplies the number by 1e8 and a '万' suffix by 1e4.

    Returns
    -------
    float or None
        The numeric value, or None when ``s`` is missing (None/NaN) or blank.

    Raises
    ------
    ValueError
        If the text left after removing the unit is not a valid number.
    """
    if pd.isna(s):
        return None

    # A column that contains only plain digits is loaded as int/float rather
    # than str; go through str() so the text operations below never hit a
    # non-string value.
    text = str(s).replace('+', '').replace(',', '').strip()
    if not text:
        return None

    # Check the larger unit first, mirroring the precedence of 亿 over 万.
    for unit, scale in (('亿', 1e8), ('万', 1e4)):
        if unit in text:
            return float(text.replace(unit, '')) * scale

    # fallback for plain numbers like '1234'
    return float(text)
    
def parse_chinese_date(s, default_year=2025, today="05月15日"):
    """Parse a timestamp like '05月06日 18:24' or '今天 11:00' into a datetime.

    Parameters
    ----------
    s : str
        Raw timestamp in the form '[YYYY年]MM月DD日 HH:MM'. The literal
        '今天' is accepted in place of the 'MM月DD日' part. Spaces are ignored.
    default_year : int, optional
        Year prepended when ``s`` does not contain '年'. Defaults to 2025.
    today : str, optional
        'MM月DD日' text substituted for '今天'. Defaults to '05月15日'.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed timestamp. ``pd.NaT`` is returned when ``s`` is missing or
        does not match the expected layout; in the latter case a message is
        printed so the offending value can be inspected.
    """
    if pd.isna(s):
        return pd.NaT

    text = str(s)
    if '年' not in text:
        text = f"{default_year}年" + text
    # handle "今天xx:xx", then drop the space between the date and the time
    text = text.replace("今天", today).replace(" ", "").strip()

    try:
        return datetime.strptime(text, "%Y年%m月%d日%H:%M")
    except ValueError as e:
        print(f"Cannot parse date {text} for {e}")
        # Return an explicit missing value so one bad row does not abort a
        # whole DataFrame.apply call.
        return pd.NaT

def clean_post_data(df):
    """Return a copy of ``df`` with parsed date, count and topic-match columns.

    Columns read: ``post_date``, ``forward``, ``comment``, ``like``,
    ``follower_num``, ``topic`` and ``text``.

    Columns added: ``parsed_post_date``, ``parsed_forward``,
    ``parsed_comment``, ``parsed_like``, ``parsed_follower`` and
    ``exact_topic``. An ``account_link`` column is dropped if present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw post table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the parsed ones.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell always starts from the same input.
    df = df.copy()

    # clean post_date into datetime values
    df["parsed_post_date"] = df['post_date'].apply(parse_chinese_date)

    # clean forward, comment, like to numbers; a cell holding only the bare
    # label (e.g. "评论") instead of a count is mapped to 0
    for column, placeholder in (("forward", "转发"), ("comment", "评论"), ("like", "赞")):
        df[f"parsed_{column}"] = df[column].apply(
            # bind placeholder as a default so each lambda keeps its own label
            lambda x, placeholder=placeholder: 0 if x == placeholder else parse_chinese_number(x)
        )

    # clean follower numbers
    df['parsed_follower'] = df['follower_num'].apply(parse_chinese_number)

    # whether the post text contains the exact topic string; rows where
    # either value is missing count as no match
    df['exact_topic'] = [
        (str(t) in str(x)) if pd.notna(t) and pd.notna(x) else False
        for t, x in zip(df["topic"], df["text"])
    ]

    # Drop account_link only when it actually exists, so frames that carry
    # just account_url do not raise KeyError.
    df = df.drop(columns=["account_link"], errors="ignore")

    return df

In [41]:
# Run the cleaning step on the "true" table and preview the first five rows.
true_cleaned = clean_post_data(df=true)
true_cleaned.head(n=5)

Unnamed: 0,account_name,post_date,forward,comment,like,text,image,video,vip,topic,...,subject,referred,follower_num,account_url,parsed_forward,parsed_comment,parsed_like,parsed_follower,exact_topic,parsed_post_date
0,央视新闻,05月06日 18:24,500,311,1835,#女生卖账号被用于色情诈骗#【转发！警惕！#央视曝光兼职注册APP账号猫腻#】“你想月入过万...,False,True,True,央视曝光兼职注册APP账号猫腻,...,social,True,1.37亿,//weibo.com/2656274875,500.0,311.0,1835.0,137000000.0,True,2025-05-06 18:24:00
1,视点视频,05月06日 21:16,10,13,56,#女生卖账号被用于色情诈骗# 近年来，女性网络账户遭犯罪团伙收购用于色情诈骗，引发广泛关注。...,False,True,True,央视曝光兼职注册APP账号猫腻,...,social,True,14.3万,//weibo.com/1116275530,10.0,13.0,56.0,143000.0,True,2025-05-06 21:16:00
2,大皖新闻,05月06日 20:07,50,52,117,#女生卖账号被用于色情诈骗#【转发！警惕！#央视曝光兼职注册APP账号猫腻#】“你想月入过万...,False,True,True,央视曝光兼职注册APP账号猫腻,...,social,True,397.1万,//weibo.com/1751714412,50.0,52.0,117.0,3971000.0,True,2025-05-06 20:07:00
3,头条校园,02月01日 10:30,3,1,2,【#兼职大学生被套走信息案发才知情##兼职日薪170竟是套个人信息抢茅台#】据@荔枝新闻 报...,False,True,True,央视曝光兼职注册APP账号猫腻,...,social,True,353.8万,//weibo.com/2306405891,3.0,1.0,2.0,3538000.0,False,2025-02-01 10:30:00
4,济宁检察,05月12日 08:47,10,评论,9,#帮别人实名注册的账号可能用于犯罪#【这类兼职可能会把你送进监狱！】#国是说法#近日，#女生...,False,True,False,央视曝光兼职注册APP账号猫腻,...,social,True,1.2万,//weibo.com/5244349500,10.0,0.0,9.0,12000.0,False,2025-05-12 08:47:00


In [None]:
# Discard every row that has a missing value in any column, then run the same
# cleaning step as for the "true" table and preview the first five rows.
false = false.dropna(axis=0, how="any")
false_cleaned = clean_post_data(df=false)
false_cleaned.head(n=5)

Unnamed: 0,account_name,post_date,forward,comment,like,text,image,video,vip,topic,...,subject,referred,follower_num,account_url,parsed_post_date,parsed_forward,parsed_comment,parsed_like,parsed_follower,exact_topic
1,一个饭团_,05月15日11:00,2003,1025,2006,CoCo的鲜百香双响炮真是我大学时的白月光！是谁一到夏天就靠这杯续命呀\n\n百香果的酸甜果...,True,False,True,解放军仪仗队亮相越南阅兵,...,politics,False,124.5万,//weibo.com/3560602223,2025-05-15 11:00:00,2003.0,1025.0,2006.0,1245000.0,False
2,追星人生赢家,05月15日11:00,2330,405,7212,#内娱第一款狐狼CP# 有这么新类型的狐狼CP进入内娱，是观众的福气！看惯了小白兔和大灰狼，...,False,True,True,解放军仪仗队亮相越南阅兵,...,politics,False,416.2万,//weibo.com/2695377883,2025-05-15 11:00:00,2330.0,405.0,7212.0,4162000.0,False
3,於食记,05月15日11:00,2226,1268,6293,#CoCo一亿杯买一送一##夏日必喝宝藏饮品#毫不违心的说，CoCo的鲜百香双响炮我阔以喝一...,True,False,True,解放军仪仗队亮相越南阅兵,...,politics,False,129.9万,//weibo.com/5689119807,2025-05-15 11:00:00,2226.0,1268.0,6293.0,1299000.0,False
4,一珂草莓,05月15日11:00,6067,3579,10271,#CoCo一亿杯买一送一# CoCo都可28周年庆「快乐当夏生日趴」来啦‼️要说夏天！怎么可...,True,False,True,解放军仪仗队亮相越南阅兵,...,politics,False,624.9万,//weibo.com/6583929199,2025-05-15 11:00:00,6067.0,3579.0,10271.0,6249000.0,False
5,湖南卫视歌手,05月15日11:00,247,1054,28541,#歌手2025直播陪看有惊喜#\n歌手首期现场抢先听！\n嘘 别告诉TA们@林志炫Terry...,False,True,True,解放军仪仗队亮相越南阅兵,...,politics,False,434.7万,//weibo.com/3166262701,2025-05-15 11:00:00,247.0,1054.0,28541.0,4347000.0,False


In [58]:
true_cleaned.columns

Index(['account_name', 'post_date', 'forward', 'comment', 'like', 'text',
       'image', 'video', 'vip', 'topic', 'topic-date', 'subject', 'referred',
       'follower_num', 'account_url', 'parsed_forward', 'parsed_comment',
       'parsed_like', 'parsed_follower', 'exact_topic', 'parsed_post_date'],
      dtype='object')

In [59]:
false_cleaned.columns

Index(['account_name', 'post_date', 'forward', 'comment', 'like', 'text',
       'image', 'video', 'vip', 'topic', 'topic-date', 'subject', 'referred',
       'follower_num', 'account_url', 'parsed_post_date', 'parsed_forward',
       'parsed_comment', 'parsed_like', 'parsed_follower', 'exact_topic'],
      dtype='object')

In [60]:
# Stack the two cleaned tables row-wise. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it the row labels of both inputs are
# kept and repeat, which makes label-based lookups ambiguous.
modeling = pd.concat([true_cleaned, false_cleaned], ignore_index=True)
len(modeling)

44031

In [61]:
# Write the combined table to disk as UTF-8, leaving out the index column.
modeling.to_csv(
    "output/modeling_data_cleaned.csv",
    index=False,
    encoding="utf-8",
)