# Initial Data Cleaning

In [82]:
import numpy as np
import pandas as pd
import pickle
import re
import datetime

import dc_func

## Clean Reuters

In [2]:
# Load the raw Reuters articles (tab-separated) and discard the
# 'Unnamed: 0' column that comes with the file.
df = pd.read_csv('reuters.tsv', sep='\t').drop(columns='Unnamed: 0')

In [3]:
# Trim each raw date value down to "month day, year": group(0) of whatever
# dc_func.str_to_date returns (presumably a regex match -- confirm in dc_func).
trimmed_dates = df['date'].map(lambda raw: dc_func.str_to_date(raw).group(0))

# Store the trimmed strings back as datetime64[ns] values
df['date'] = trimmed_dates.astype('datetime64[ns]')

In [4]:
# Remove articles dated before the protests. The cut-off and the comparison
# are the ones originally used to look the row up by hand (that lookup found
# a single row, index 363). Dropping by condition instead of by a hard-coded
# label keeps this step correct if the input file or row order changes.
protest_start = pd.Timestamp(2019, 3, 15)
df.drop(index=df.index[df['date'] < protest_start], inplace=True)

# Drop rows with no text body (dropna removes any row with a missing value)
df.dropna(inplace=True)

In [5]:
# Reduce every article body to its text using the Reuters-specific cleaner
df['body'] = df['body'].map(dc_func.clean_body_reu)

In [6]:
# De-duplicate in two passes, in this order: repeated headlines first,
# then repeated bodies among the rows that remain.
for column in ('headline', 'body'):
    df.drop_duplicates(subset=column, inplace=True)

In [7]:
# Drop articles that do not pertain to Hong Kong. The filtering itself is
# done by dc_func.not_HK, whose result replaces df.
df = dc_func.not_HK(df)

In [8]:
# Save tsv for Reuters
# df.to_csv('clean_reuters.tsv', sep='\t')

### Notes
- Total articles: 466

## Clean CCTV

In [9]:
# Load the raw CCTV articles (tab-separated) and discard the
# 'Unnamed: 0' column that comes with the file.
df = pd.read_csv('cctv.tsv', sep='\t').drop(columns='Unnamed: 0')

In [10]:
# Cast the 'date' column from strings to datetime64[ns]
df = df.astype({'date': 'datetime64[ns]'})

# Keep only the first row for each distinct article body
df.drop_duplicates(subset='body', inplace=True)

In [11]:
# Clean every article body with the CCTV-specific helper from dc_func
df['body'] = df['body'].map(dc_func.clean_body_cctv)

In [12]:
# Replace df with the result of dc_func.not_HK (presumably removes articles
# that are not about Hong Kong -- confirm in dc_func).
df = dc_func.not_HK(df)

In [13]:
# Save tsv for CCTV
# df.to_csv('clean_cctv.tsv', sep='\t')

### Notes
- Total articles: 182

## Clean ABC

In [14]:
# Load the raw ABC articles (tab-separated) and discard the
# 'Unnamed: 0' column that comes with the file.
df = pd.read_csv('abc.tsv', sep='\t').drop(columns='Unnamed: 0')

In [15]:
# Run each raw date through dc_func.con_to_string (presumably yields a
# parseable date string -- confirm in dc_func), then cast the column to
# datetime64[ns] in the same step.
df['date'] = df['date'].map(dc_func.con_to_string).astype('datetime64[ns]')

In [16]:
# Drop rows whose headline repeats an earlier one (the first occurrence is
# kept); df is modified in place.
df.drop_duplicates(subset='headline', inplace=True)

In [17]:
# Drop the row with an empty body. Note that dropna() removes every row that
# has a missing value in any column, not only in 'body'.
df.dropna(inplace=True)

In [18]:
# Clean every article body with the ABC-specific helper from dc_func
df['body'] = df['body'].map(dc_func.clean_body_abc)

In [19]:
# Replace df with the result of dc_func.not_HK (presumably removes articles
# that are not about Hong Kong -- confirm in dc_func).
df = dc_func.not_HK(df)

In [20]:
# df.to_csv('clean_abc.tsv', sep='\t')

### Notes
- Total articles: 111

## Clean CNN

In [24]:
# Load the raw CNN articles (tab-separated) without the 'Unnamed: 0' column
df = pd.read_csv('cnn.tsv', sep='\t').drop(columns='Unnamed: 0')

# Discard, in place, every row that has a missing value in any column
df.dropna(inplace=True)

In [28]:
# Convert each raw CNN date with dc_func.string_to_date_cnn, then cast the
# column to datetime64[ns].
converted_dates = df['date'].map(dc_func.string_to_date_cnn)
df['date'] = converted_dates.astype('datetime64[ns]')

# Clean every article body with the CNN-specific helper from dc_func
df['body'] = df['body'].map(dc_func.clean_body_cnn)

In [31]:
# Replace df with the result of dc_func.not_HK (presumably removes articles
# that are not about Hong Kong -- confirm in dc_func).
df = dc_func.not_HK(df)

In [33]:
# df.to_csv('clean_cnn.tsv', sep='\t')

### Notes
- Total articles: 96

## Clean SCMP

In [122]:
# The SCMP articles are stored as 14 tab-separated chunk files. Read them in
# file order and bind each frame to its own name (dft1 .. dft14).
scmp_files = [
    'scmp0_200.tsv',
    'scmp200_400.tsv',
    'scmp400_600.tsv',
    'scmp600_800.tsv',
    'scmp800_1000.tsv',
    'scmp1000_1200.tsv',
    'scmp1200_1400.tsv',
    'scmp1400_1600.tsv',
    'scmp1600_1700.tsv',
    'scmp1700_1800.tsv',
    'scmp1800_1900.tsv',
    'scmp1900_2000.tsv',
    'scmp2000_2100.tsv',
    'scmp2100_end.tsv',
]
(dft1, dft2, dft3, dft4, dft5, dft6, dft7,
 dft8, dft9, dft10, dft11, dft12, dft13, dft14) = [
    pd.read_csv(path, sep='\t') for path in scmp_files
]

In [123]:
# Stack all SCMP chunks, in file order, into dft1. A single concat call copies
# the data once; concatenating inside a loop re-copies the growing frame on
# every iteration and yields the same rows.
dfts = [dft2, dft3, dft4, dft5, dft6, dft7, dft8, dft9, dft10, dft11, dft12, dft13, dft14]
dft1 = pd.concat([dft1] + dfts)

In [125]:
# Remove the 'Unnamed: 0' column from the combined SCMP frame, in place
dft1.drop(columns='Unnamed: 0', inplace=True)

In [127]:
# Replace the index with a fresh 0..n-1 one; the old index values are moved
# into a new 'index' column.
# NOTE(review): that 'index' column is dropped in the next cell;
# reset_index(drop=True) would avoid creating it in the first place.
dft1.reset_index(inplace=True)

In [129]:
# Discard the 'index' column created by the preceding reset_index call
dft1.drop(columns='index', inplace=True)

In [132]:
# Drop, in place, every row that has a missing value in any column
dft1.dropna(inplace=True)

In [136]:
# Cast the 'date' column to datetime64[ns]; all other columns are unchanged
dft1 = dft1.astype({'date': 'datetime64[ns]'})

In [142]:
# dft1.to_csv('clean_scmp.tsv', sep='\t')

# Merge Data

In [143]:
# Read back the cleaned per-outlet files. These are the files written by the
# (commented-out) to_csv lines at the end of each cleaning section.
df_scmp, df_reu, df_cctv, df_abc, df_cnn = (
    pd.read_csv(path, sep='\t')
    for path in (
        'clean_scmp.tsv',
        'clean_reuters.tsv',
        'clean_cctv.tsv',
        'clean_abc.tsv',
        'clean_cnn.tsv',
    )
)

In [144]:
# Combine all DataFrames from different news outlets
frames = [df_scmp, df_reu, df_cctv, df_abc, df_cnn]
df = pd.concat(frames)

# Standardize
df.reset_index(inplace=True)
df.drop(columns=['index', 'Unnamed: 0'], inplace=True)
df['date'] = df['date'].astype('datetime64[ns]')

In [146]:
# file = open('articles.p', 'wb') 
# pickle.dump(df, file)                      
# file.close()