## Import the packages

In [14]:
import pkuseg
import re
import pandas as pd

## Helper functions
1. read_csv -> load a csv file into a DataFrame
2. read_txt -> read a txt file into a list of lines
3. remove_notation -> keep only the Chinese characters, English letters and digits of a string
4. sentence_preprocess -> remove media-related words from a Weibo post

In [15]:
def read_csv(file_path):
    """Load the CSV file at ``file_path`` into a pandas DataFrame.

    The file is decoded as UTF-8; every other parsing option is left at the
    pandas default.
    """
    return pd.read_csv(file_path, encoding='utf-8')

def read_txt(file_path):
    """Read a text file and return its lines as a list of stripped strings.

    Each line has leading/trailing whitespace (including the newline)
    removed. Blank lines are kept and show up as empty strings.

    The file is decoded explicitly as UTF-8 so that Chinese word lists load
    the same way on every platform, and it is opened in a ``with`` block so
    the handle is closed even if reading fails.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

def remove_notation(dirty_word):
    """Return ``dirty_word`` with every disallowed character deleted.

    Only CJK ideographs in the range U+4E00..U+9FA5, ASCII letters and ASCII
    digits are kept; punctuation, whitespace and all other symbols are
    removed.
    """
    return re.sub('[^\u4e00-\u9fa5a-zA-Z0-9]', '', dirty_word)

# Boilerplate patterns stripped from every post, compiled once rather than on
# each call. They are applied in this order.
_BOILERPLATE_PATTERNS = (
    # Literal "L" + Chinese/alphanumeric run + "的" + Chinese run + "视" + any
    # one character (presumably video-link captions such as "L某某的微博视频"
    # -- confirm against the raw data).
    re.compile('L[\u4e00-\u9fa5a-zA-Z0-9]*的[\u4e00-\u9fa5]*视.'),
    # Full-width parenthesised "（凤凰卫视...报道）" credit.
    re.compile('（凤凰卫视[\u4e00-\u9fa5a-zA-Z0-9]*报道）'),
    # "看凤凰专题" followed by a Chinese/alphanumeric run.
    re.compile('看凤凰专题[\u4e00-\u9fa5a-zA-Z0-9]*'),
)

# Media word list loaded from disk on first use and reused afterwards, so the
# file is not re-read for every post.
_media_words = None


def sentence_preprocess(item, medias=None):
    """Strip media boilerplate and media words from one Weibo post.

    Args:
        item: The raw post text.
        medias: Optional iterable of substrings to delete from the text.
            When omitted, the list is read from ``../dataset/media.txt``
            (one entry per line) the first time it is needed and cached for
            later calls.

    Returns:
        The text with every boilerplate pattern match and every occurrence
        of each media word removed.
    """
    global _media_words
    if medias is None:
        if _media_words is None:
            _media_words = read_txt('../dataset/media.txt')
        medias = _media_words

    for pattern in _BOILERPLATE_PATTERNS:
        item = pattern.sub('', item)

    for media in medias:
        item = item.replace(media, "")
    return item

## Initialize the word segmentation tool

In [16]:
# Custom words handed to the segmenter as its user dictionary.
lexicon = [
    '特朗普',
    '微博',
]

# pkuseg segmenter built from the 'default' model plus the custom lexicon.
seg = pkuseg.pkuseg(user_dict=lexicon, model_name='default')

## Read Weibo dataset

In [17]:
# Load the raw Weibo dataset.
df = pd.read_csv('../dataset/trump/trump_dataset.csv')

# Post bodies as strings. Missing values are replaced with '' first, because
# astype(str) alone would turn NaN into the literal text 'nan', which would
# then be processed as if it were a real post.
doc_list = df['微博正文'].fillna('').astype(str)

## Preprocess the Weibo dataset
1. clean
2. word segmentation
3. remove stop words

In [18]:
# Stage 1: strip media boilerplate from each post, then segment it into words.
rst = [seg.cut(sentence_preprocess(doc)) for doc in doc_list]

# Stage 2: remove punctuation/symbols from every token and keep only the
# tokens that are still longer than one character afterwards.
result = []
for words in rst:
    cleaned_words = (remove_notation(word) for word in words)
    result.append([word for word in cleaned_words if len(word) > 1])

# Stage 3: drop stop words. The list is kept under its original name, and a
# set is used for membership tests so each lookup is O(1) instead of a scan.
stop_words_list = read_txt('../dataset/stop_words.txt')
stop_words = set(stop_words_list)

# Each post becomes one string in which every kept word is followed by a
# single space (so a post with no remaining words becomes ''). No separate
# check for '\t' is needed: tokens here have already passed remove_notation
# and the length filter, so they cannot be a tab.
result1 = [
    ''.join(word + ' ' for word in words if word not in stop_words)
    for words in result
]

## Write the preprocessed text data to a new csv file

In [19]:
# Swap the raw text column for the preprocessed text. Dropping the column and
# adding it again places '微博正文' as the last column of the frame.
df.drop(columns=['微博正文'], inplace=True)
df['微博正文'] = result1

# NOTE(review): to_csv runs with its defaults, so the row index is written as
# an extra unnamed leading column -- consider index=False if that is unwanted.
df.to_csv('../dataset/trump/preprocessed_v3.csv')

## Remove rows with empty Weibo text from the csv file

In [20]:
# Reload the file just written: with read_csv's defaults, empty text fields
# come back as NaN, which is what the filter below keys on.
df = pd.read_csv('../dataset/trump/preprocessed_v3.csv')

# Keep only the rows whose preprocessed text is present.
df = df[df['微博正文'].notna()]

# Overwrite the file with the filtered rows.
# NOTE(review): the index is written again here (to_csv default), so each
# write/read round trip adds another unnamed index column -- confirm whether
# downstream readers expect that.
df.to_csv('../dataset/trump/preprocessed_v3.csv')