## Translate file to English

Save the column text to an Excel file and translate that file with Google Translate's document translation feature.

Load the translated version back as a local Chinese–English dictionary and use it to translate the column.

In [None]:
import os
import pandas as pd

def get_file_names(input_directory):
    """Return the names of the ``.csv`` entries found directly in a directory.

    Parameters
    ----------
    input_directory : str or path-like
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    list of str
        Bare entry names (no directory prefix) ending in ``.csv``, sorted
        alphabetically.

    Notes
    -----
    ``os.listdir`` returns entries in arbitrary order. Sorting makes the result
    deterministic, so positional lookups on the list built from these names
    (e.g. ``df_list[1]``) refer to the same file on every run.
    """
    return sorted(
        file_name
        for file_name in os.listdir(input_directory)
        if file_name.endswith(".csv")
    )


def get_data(input_directory, file_list):
    """Load CSV files into DataFrames, dropping columns whose name starts with "Unnamed".

    Parameters
    ----------
    input_directory : str or path-like
        Directory that contains the files.
    file_list : iterable of str
        File names relative to ``input_directory``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``file_list``, in the same order.
    """
    df_list = []
    for file_name in file_list:
        # os.path.join copes with a trailing separator on the directory and
        # with path-like objects, unlike manual "/" concatenation.
        data = pd.read_csv(os.path.join(input_directory, file_name))
        # read_csv names header-less columns "Unnamed: N"; these are typically
        # an index column written by an earlier to_csv call.
        is_unnamed = data.columns.str.contains('^Unnamed')
        df_list.append(data.loc[:, ~is_unnamed])
    return df_list

# Directory containing the input CSV files.
dir_name = 'topic_result_v1'
# One DataFrame per CSV file found in that directory; later cells index into this list by position.
df_list = get_data(dir_name, get_file_names(dir_name))

In [54]:
import jieba.analyse
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
import jieba

# Build a vocabulary from the keywords jieba extracts out of every frame's `topic_keyword` column.
word_list = []
for df in df_list:
    # Concatenate the whole column into a single space-separated string.
    l = df.topic_keyword.tolist()
    l = ' '.join(l)

    # withWeight=True makes each tag a (word, weight) pair; topK caps the number of tags.
    tags = jieba.analyse.extract_tags(l, topK=1000, withWeight=True)
    # NOTE(review): freq_dict is not read anywhere in the visible notebook.
    freq_dict = dict(tags)
    
    # Keep only the words, dropping the weights.
    new_word_list = [a_tuple[0] for a_tuple in tags]
    
    # Union with the words collected so far; going through sets makes the list order arbitrary.
    word_list= list(set(word_list) | set(new_word_list))

In [126]:
# Sanity check: is the keyword '会' in the collected vocabulary?
# `new_word` is built by the In [124] cell, which appears further down in this notebook.
'会' in new_word

True

In [125]:
# One keyword per row, in `new_word` order; this frame is what gets exported for translation.
# NOTE(review): this rebinds `df`, the same name the loops over `df_list` use as their loop variable.
df = pd.DataFrame(new_word)
df

Unnamed: 0,0
0,不明
1,原因
2,专家组
3,进行
4,病毒性
...,...
506,一定
507,出门
508,肯定的
509,飞沫


In [127]:
# Export the keyword table to Excel so it can be translated outside the notebook.
df.to_excel("topic_result.xlsx")  


In [128]:
import csv
# Load the translated keyword table; the displayed output shows a `ch` column (source word)
# and an `en` column (its translation).
translation_dict_df = pd.read_csv("topic_result_dictionary.csv")
translation_dict_df

Unnamed: 0,ch,en
0,不明,unknown
1,原因,the reason
2,专家组,Expert Group
3,进行,get on
4,病毒性,Viral
...,...,...
506,一定,for sure
507,出门,Go out
508,肯定的,definitely
509,飞沫,Droplets


In [129]:
# Build the local Chinese -> English lookup: keys come from the `ch` column, values from `en`.
translation_dict = translation_dict_df.set_index('ch')['en'].to_dict()
len(translation_dict)

511

In [136]:
# Add an English version of the keyword column to every frame, in place.
# Requires `translate_topic_keyword` and `translation_dict` to be defined already
# (the function's cell appears further down in this notebook).
for df in df_list:
    df['topic_keyword_en']=df['topic_keyword'].apply(translate_topic_keyword)

In [142]:
# Inspect the `date` column of the fourth loaded frame.
# NOTE(review): the saved output below is a whole DataFrame rather than one column,
# so it appears to come from an earlier version of this cell.
df_list[3].date

Unnamed: 0,id,label,keyword,date,content,time,forward,comment,like,predict,__label__positive,__label__negative,__label__neutral,predict_score,topic,topic_keyword,topic_keyword_en
0,1,-1,新冠,2020-03-15,天津 新冠 肺炎 确诊 病例 清零,2020-03-15 23:00:00,0,0,0,__label__positive,0.47659,0.17782,0.00001,0.47659,2.0,"确诊, 病例, 例, 西班牙, 新增","Confirmed diagnosis, Case, example, Spain, New"
1,2,-1,新冠,2020-03-15,钟南山 说 新冠有 可能 不是 源自 于 中国 赵立坚 质疑 美国 军人 通过 武汉 军运会...,2020-03-15 08:01:00,0,0,0,__label__neutral,0.18714,0.10670,0.20182,0.20182,0.0,"国家, 病毒, 免疫, 群体, 一个","country, virus, Immunity, group, One"
2,3,-1,新冠,2020-03-15,放屁 者 让 闻到 的 人 不会 感染 新冠 肺炎 病毒 大伙 终于 安心,2020-02-23 08:43:00,1,1,0,__label__negative,0.20182,0.81288,0.00050,0.81288,0.0,"国家, 病毒, 免疫, 群体, 一个","country, virus, Immunity, group, One"
3,4,-1,新冠,2020-03-15,不要 点破 玻璃心 受不了 国家 卫健委 高级别 专家 组成员 香港大学 新 发 传染性 疾...,2020-03-15 08:08:00,0,0,0,__label__positive,0.34159,0.08271,0.12253,0.34159,0.0,"国家, 病毒, 免疫, 群体, 一个","country, virus, Immunity, group, One"
4,5,-1,新冠,2020-03-15,天气 转暖后 新冠 病毒 会 偃旗息鼓 吗 o 天气 转暖后 新冠 病毒 会 偃旗息鼓 吗,2020-03-15 22:59:00,0,0,0,__label__negative,0.00001,0.99684,0.00151,0.99684,1.0,"病毒, 中国, 西班牙, 二千, 美国","virus, China, Spain, Two thousand, United States"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,1035,-1,新冠,2020-03-15,目前 看 全球 新冠 抗疫于 今年夏天 结束 已 不现实 进一步 跨年度 的 风险 加大,2020-03-15 22:47:00,0,1,0,__label__negative,0.05034,0.77731,0.00213,0.77731,3.0,"德国, 医院, 患者, 病毒, 十五日","Germany, hospital, patient, virus, 15th"
745,1036,-1,新冠,2020-03-15,美国 这个 缓慢的 检测 速度 真的 要 逼 疯 我 然而 大 统领 竟然 还 在 周五 的...,2020-03-15 12:49:00,0,0,0,__label__negative,0.03211,0.80318,0.04604,0.80318,4.0,"美国, 中国, 英国, 会, 境外","United States, China, United Kingdom, meeting,..."
746,1037,-1,新冠,2020-03-15,西班牙 一日 内 新增 二千 例新冠 肺炎 当地 时间 十五日 十一 三十 西班牙 卫生部 ...,2020-03-15 22:05:00,0,0,0,__label__neutral,0.00074,0.00650,0.99495,0.99495,2.0,"确诊, 病例, 例, 西班牙, 新增","Confirmed diagnosis, Case, example, Spain, New"
747,1038,-1,新冠,2020-03-15,今天 认真 看 了 有关 群体 免疫 的 介绍 与 讨论 如果 新冠 对 标的 是 一千 九...,2020-03-15 22:47:00,0,0,0,__label__negative,0.02845,0.88081,0.01800,0.88081,3.0,"德国, 医院, 患者, 病毒, 十五日","Germany, hospital, patient, virus, 15th"


In [144]:
# Write each translated frame to its own CSV, named after the date in its first row.
for df in df_list:
    # Label-based lookup of row 0; relies on the default 0..n-1 index that read_csv assigns.
    date = df['date'][0]
#     print (date)
    # NOTE(review): to_csv also writes the index by default, and two frames whose first
    # row has the same date would write to the same file — confirm this is intended.
    df.to_csv("processed_data"+date+".csv")

In [124]:
def to_list(s):
    """Split a ``', '``-separated keyword string into a list of keywords.

    Parameters
    ----------
    s : str
        Keywords joined by a comma followed by a space, e.g. ``'中国, 病毒'``.

    Returns
    -------
    list of str
        The individual keywords. Leading and trailing whitespace of ``s`` is
        removed before splitting.
    """
    # The original called strip(''), which strips nothing; strip() with no
    # argument removes the surrounding whitespace as intended.
    return s.strip().split(', ')

# Collect every distinct keyword across all frames, keeping first-seen order.
new_word = []
for df in df_list:
    # The loop variable is `keywords`, not `word_list`: reusing `word_list` here
    # overwrote the global of that name built by the jieba cell.
    for keywords in df.topic_keyword.apply(to_list):
        for word in keywords:
            if word not in new_word:
                new_word.append(word)
# Sanity check: the single-character keyword '例' should have been collected.
'例' in new_word

True

In [99]:
# Inspect the raw keyword strings of the second frame; each row is a ', '-separated list.
df_list[1].topic_keyword#[0].strip('').split(', ') 

0         中国, 国家, 病毒, 全球, 境外
1         中国, 病例, 确诊, 五万, 以外
2          例, 患者, 三月, 一例, 症状
3          例, 患者, 三月, 一例, 症状
4          例, 患者, 三月, 一例, 症状
               ...          
203      意大利, 病例, 确诊, 三月, 中国
204        例, 患者, 三月, 一例, 症状
205       中国, 病例, 确诊, 五万, 以外
206       中国, 国家, 病毒, 全球, 境外
207    全国, 法令, 西班牙政府, 封锁, 确诊
Name: topic_keyword, Length: 208, dtype: object

In [135]:
def translate_topic_keyword(topic_keyword, dictionary=None):
    """Translate a ``', '``-separated keyword string word by word.

    Parameters
    ----------
    topic_keyword : str
        Keywords joined by a comma followed by a space.
    dictionary : mapping of str to str, optional
        Source-word -> translation lookup. Defaults to the notebook-level
        ``translation_dict``.

    Returns
    -------
    str
        The translated keywords joined by ``', '``, in the original order.
        A word with no usable translation is kept as it is.
    """
    if dictionary is None:
        dictionary = translation_dict

    translated = []
    # strip() rather than the original strip(''), which strips nothing.
    for word in topic_keyword.strip().split(', '):
        word_en = dictionary.get(word)
        # Fall back to the source word when it is missing from the dictionary or
        # its entry is not a string (e.g. NaN from an empty cell), which would
        # otherwise make the join below raise TypeError.
        translated.append(word_en if isinstance(word_en, str) else word)
    # Join once after the loop instead of rebuilding the string on every iteration.
    return ', '.join(translated)

In [97]:
# Spot-check the translation on the first keyword string of the second frame.
translate_topic_keyword(df_list[1].topic_keyword[0])

China, country, virus, global, Overseas


['China', 'country', 'virus', 'global', 'Overseas']

In [131]:
# Translate the whole column and show the result for the row labelled 0.
# NOTE(review): the saved output is a list, while translate_topic_keyword as defined in
# this notebook returns a joined string — the output looks stale.
df_list[1].topic_keyword.apply(translate_topic_keyword)[0]

['China', 'country', 'virus', 'global', 'Overseas']

In [100]:
# Look up one keyword; dict.get returns None when the key is absent, and a None result displays no output.
translation_dict.get('例')

In [103]:
# Check whether the keyword '一例' has an entry in the translation dictionary.
'一例' in translation_dict

True