**Purpose**: Process the raw data into a form that can be fitted to the model in the next step.
- Concatenate the data
- Format the data
- Add feature variables
- Sentence segmentation
- Tokenization

In [1]:
import glob
import os
import pickle
import re
import time

import pandas as pd

In [2]:
# process the csv files and constuct a df

def process_csv(source, base_path, date_format, publisher):
    
    # combine csv   
    os.chdir(source)
    all_filenames = [i for i in glob.glob(source + '*.csv')]
    df = pd.concat([pd.read_csv(f) for f in all_filenames]) # concat all csv
    
    # drop duplicates and faulty data
    df.drop_duplicates(inplace=True)                 # delete duplicate
    df = df[df['Content'].astype(str).map(len)> 50]  # delete articles with content less than 50 characters
                                                     # This will drop all the failed cases too. 

        
    # feature and formatting
    df['Is_offical']=df['Author'].apply(lambda offical: offical==publisher)
    df['Content_len']=df['Content'].str.len()
    df['Date']=pd.to_datetime(df['Date'],format=date_format) # Convert date str to date
    df.sort_values(by=['Date','Author','Category'],inplace=True) # sort data
    df = df.reset_index(drop=True)                               # reset index
        
    # export to csv
    df.to_csv(base_path + publisher + "_concat.csv", index=False) #encoding='utf-8-sig'
    print('Saved: ' + base_path + publisher + "_concat.csv")
    
    return df

In [17]:
stand_news_folder = 'C:/Users/sunny/Desktop/news_nlp/stand_news_collection/'
sing_tao_daily_folder = 'C:/Users/sunny/Desktop/news_nlp/sing_tao_daily_collection/'
base_path = 'C:/Users/sunny/Desktop/news_nlp/'
date_format='%Y/%m/%d — %H:%M'
publisher='立場報道'

stand_news_df = process_csv(stand_news_folder, base_path, date_format, publisher)

Saved: C:/Users/sunny/Desktop/news_nlp/立場報道_concat.csv


Let's examine the cleaned, concatenated data.

In [18]:
stand_news_df.head(5)

Unnamed: 0,Title,Author,Category,Date,Link,Content,Is_offical,Content_len
0,三大社運種籽發芽　推倒送中惡法可勝,練乙錚,政治,2019-06-01 00:00:00,https://www.thestandnews.com/politics/%E4%B8%8...,反對修訂《逃犯條例》的運動正風起雲湧，政府出盡吃奶力投入的「正能量」，卻成為運動的火種。大家...,False,3162
1,一座現代城市的編年史，我心中主流神劇之最：HBO《The Wire》（篇九）,陳裕匡,文化,2019-06-01 09:55:00,https://www.thestandnews.com/culture/%E4%B8%80...,（九）要你專注的一種形式本文含有《The Wire》的Spoilers，請斟酌使用身邊在看、...,False,883
2,李顯龍：中國壯大改變戰略平衡，中國與全球都須調整適應，中美衝突雙方都不可能垮台,立場報道,國際,2019-06-01 10:00:00,https://www.thestandnews.com/international/%E6...,新加坡總理李顯龍表示，中國自改革開放後取得的增長改變了戰略平衡，也轉移了世界的經濟重心。中國...,True,1525
3,不是「代表你」，而是為自己發聲,邢福增,政治,2019-06-01 10:16:00,https://www.thestandnews.com/politics/%E4%B8%8...,在多元化的社會，群體有不同意見，是十分正常的現象。在一元化的社會，掌權者卻聲稱自己擁有最廣泛...,False,704
4,美國務卿警告德國：使用華為設備就不要想共享情報,立場報道,國際,2019-06-01 10:46:00,https://www.thestandnews.com/international/%E7...,路透社報道，美國周五（5月31日）向西方盟友施壓，稱允許中國華為建設電信基礎設施的國家可能被...,True,778


In [20]:
stand_news_df['Content_len'].describe()    # article length

count    13257.000000
mean      1201.547635
std       1304.898046
min         56.000000
25%        538.000000
50%        871.000000
75%       1413.000000
max      33142.000000
Name: Content_len, dtype: float64

In [8]:
# Sentence Segmentation and Tokenization 
def sent_segmentation_and_tokenization(base_path,publisher):
    import jieba
    import pickle

    start = time.perf_counter()

    # Sentence Segmentation

    raw_csv_path=os.path.join(base_path, publisher + '_concat.csv')
    df = pd.read_csv(raw_csv_path)
    #df = df.loc[df['Author']=='立場報道'] # only keep article written by publisher
    df['Sentence']= df['Content'].apply(lambda text: re.split("\n|。」|！」|\!」|？」|\?」|。|？|\?|！|!|;|；",str(text)))


    # Tokenization 
    os.chdir(base_path)
    jieba.set_dictionary('dict.txt.big')
    jieba.load_userdict("hk_dict.txt") # dictionary of Hong Kong local names and terms
    sent_ls = df['Sentence'].tolist()
    sent = [list(jieba.cut(sent,cut_all=False)) for article in sent_ls for sent in article]

    sent_pkl = os.path.join(base_path, publisher + '_sent.pkl')



    with open(sent_pkl, "wb") as f:   #Pickling
        pickle.dump(sent, f)

    finish = time.perf_counter()
    print(f'Finished sentence segementation and tokenization in {round(finish-start, 2)} secound(s)')


In [35]:
# load sent using pickle
with open(sent_pkl, "rb") as f:   # Unpickling
    sent= pickle.load(f)

In [40]:
# Peek at the first tokenized sentence to sanity-check the jieba segmentation.
sent[0]

['反對',
 '修訂',
 '《',
 '逃犯條例',
 '》',
 '的',
 '運動',
 '正風',
 '起雲',
 '湧',
 '，',
 '政府',
 '出盡',
 '吃奶',
 '力',
 '投入',
 '的',
 '「',
 '正',
 '能量',
 '」',
 '，',
 '卻',
 '成為',
 '運動',
 '的',
 '火種']

***

In [5]:
sing_tao_daily_folder = 'C:/Users/sunny/Desktop/news_nlp/sing_tao_daily_collection/'
base_path = 'C:/Users/sunny/Desktop/news_nlp/'
date_format='%Y-%m-%d'
publisher='星島日報'

std_news_df = process_csv(sing_tao_daily_folder, base_path, date_format, publisher)

Saved: C:/Users/sunny/Desktop/news_nlp/星島日報_concat.csv


In [6]:
std_news_df.head(5)

Unnamed: 0,Title,Author,Category,Date,Link,Content,Is_offical,Content_len
0,韓國瑜否認有私生女 控綠媒董事長誹謗,星島日報,中國,2019-06-01,http://std.stheadline.com/daily/article/detail...,（星島日報報道）台灣人氣王韓國瑜的支持者，預備在今日於台北的總統府前凱達格蘭大道舉行挺韓...,True,821
1,台外島部署火箭系統 射程覆蓋閩境大城市,星島日報,中國,2019-06-01,http://std.stheadline.com/daily/article/detail...,（星島日報報道）台灣媒體透露，台灣軍方已將自己研發的「雷霆2000」多管火箭系統部署在大...,True,498
2,魏鳳和晤美防長 強調維護主權決心,星島日報,中國,2019-06-01,http://std.stheadline.com/daily/article/detail...,（星島日報報道）中國國防部長魏鳳和，昨天傍晚同美國代理防長沙納漢在新加坡舉行了「積極、建...,True,479
3,霍士女主播再邀戰 劉欣：非常願意,星島日報,中國,2019-06-01,http://std.stheadline.com/daily/article/detail...,（星島日報報道）中美女主播「貿易戰辯論」前日重磅登場。雙方就貿易關稅、知識產權、華為問題...,True,620
4,國旗印字賣廣告 湖南車企涉違法,星島日報,中國,2019-06-01,http://std.stheadline.com/daily/article/detail...,（星島日報報道）湖南一批學生，日前公開展示一面國旗，但國旗上竟印有湖南汽車生產企業「哪吒...,True,450


In [9]:
sent_segmentation_and_tokenization(base_path,publisher)

Building prefix dict from C:\Users\sunny\Desktop\news_nlp\dict.txt.big ...
Dumping model to file cache C:\Users\sunny\AppData\Local\Temp\jieba.uad31746bf4d8ac03e4f91994235f9d80.cache
Loading model cost 1.164 seconds.
Prefix dict has been built successfully.


Finished sentence segementation and tokenization in 33.92 secound(s)


***

In [110]:
# Experiment with 'keep' and 'inplace'
# keep=False: delete duplicate rows, even the 'original' (options: "first","last",False)
# inplace=True: direct make changes to the dataframe instead of create a copy

import pandas as pd
df1 = pd.DataFrame({'Key': ['a','b','a'], 'data1': ['a', 2, 3]})
df2 = pd.DataFrame({'Key': ['a', 'b', 'd'], 'data1': ['a', 'boy', 'dog']})
result = pd.concat([df1,df2])

print(result)

result.drop_duplicates(inplace=True)
# result.drop_duplicates(keep="first",inplace=True)
# result.drop_duplicates(keep=False,inplace=True)
print(result)

  Key data1
0   a     a
1   b     2
2   a     3
0   a     a
1   b   boy
2   d   dog
  Key data1
0   a     a
1   b     2
2   a     3
1   b   boy
2   d   dog
