In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used by the
# cells below resolve. google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pythainlp
!pip install python-crfsuite

Collecting pythainlp
  Downloading pythainlp-3.0.8-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 4.4 MB/s 
[?25hCollecting tinydb>=3.0
  Downloading tinydb-4.7.0-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, pythainlp
Successfully installed pythainlp-3.0.8 tinydb-4.7.0
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 6.8 MB/s 
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.8


In [None]:
import pandas as pd
import json
import random
from tqdm import tqdm

import pythainlp
import pycrfsuite
from pythainlp.tokenize.crfcut import segment

In [None]:
def preprocess_sentencepiece(x):
    """Split text into stripped sentences using pythainlp's CRF segmenter.

    Args:
        x: Either an iterable of strings (joined with single spaces before
            segmentation) or a single string, which is treated as one text.

    Returns:
        list[str]: The non-empty segments with surrounding whitespace removed.
        A segment that is empty after stripping is represented by a single
        " " marker, emitted at most once in a row and never as the first
        element. On any failure the error and the offending input are printed
        and an empty list is returned (best-effort, so one bad row does not
        abort a whole ``Series.apply``).
    """
    try:
        # A bare string would otherwise be iterated character by character by
        # " ".join(), feeding the segmenter "c h a r s" instead of the text.
        texts = [x] if isinstance(x, str) else x

        output = []
        for sentence in segment(" ".join(texts)):
            stripped = sentence.strip()
            if stripped:
                output.append(stripped)
            elif output and output[-1] != " ":
                # Collapse runs of blank segments into one " " separator.
                output.append(" ")
        return output

    except Exception as e:
        print(e)
        print(x)
        return []

In [None]:
def to_list(x):
    """Wrap a single value in a new one-element list."""
    return [x]

In [None]:
def process_data(file_path, out_path):
    """Convert a raw JSON-lines news dump into a sentence-segmented dataset.

    Reads ``file_path`` (one JSON object per line), removes duplicate rows,
    sentence-segments the "Title" and "Detail" columns, tags every row as
    "Fact News" and writes one JSON object per line to ``out_path``.

    Args:
        file_path: Path of the input JSON-lines file. Must contain "Title"
            and "Detail" columns.
        out_path: Path of the JSON-lines file to write (overwritten).
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)

    # Compare rows by their string form when de-duplicating
    # (presumably because list-valued cells are not hashable).
    df = df.loc[df.astype(str).drop_duplicates().index]

    # Source-specific clean-ups, left disabled in the original notebook:
    # Matichon
    # df = df.drop([713, 763, 853, 859])

    # Kapook
    # df = df.drop([836])
    # df["Detail"] = df["Detail"].apply(lambda x: x[:-4])

    # "Title" is wrapped in a list first; "Detail" is passed as-is
    # (assumes it already holds a list of strings -- TODO confirm per source).
    df["Title"] = df["Title"].apply(to_list).apply(preprocess_sentencepiece)
    df["Detail"] = df["Detail"].apply(preprocess_sentencepiece)

    df["Document Tag"] = "Fact News"

    json_lines = df.apply(lambda row: row.to_json(), axis=1)

    # Context manager guarantees the handle is closed even if a write fails.
    with open(out_path, "w", encoding="utf8") as file_data:
        for json_line in json_lines:
            file_data.write(json_line + "\n")

## Pptvhd36

In [None]:
def process_pptvhd36_data(file_path, out_path):
    """Convert a raw Pptvhd36 JSON-lines dump into the common dataset layout.

    Reads ``file_path``, removes duplicate rows, sentence-segments "title",
    "subcontent" and "content", stores the results as "Title" and "Detail"
    (segmented subcontent followed by segmented content), tags every row as
    "Fact News", drops the three source columns and writes one JSON object
    per line to ``out_path``.

    Args:
        file_path: Path of the input JSON-lines file. Must contain "title",
            "subcontent" and "content" columns.
        out_path: Path of the JSON-lines file to write (overwritten).
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)
    # Compare rows by their string form when de-duplicating.
    df = df.loc[df.astype(str).drop_duplicates().index]

    # "title" and "subcontent" are wrapped in a list first; "content" is
    # passed as-is (assumes it already holds a list of strings -- TODO confirm).
    df["Title"] = df["title"].apply(to_list).apply(preprocess_sentencepiece)
    df["subcontent"] = df["subcontent"].apply(to_list).apply(preprocess_sentencepiece)
    df["content"] = df["content"].apply(preprocess_sentencepiece)

    # Both columns hold lists here, so .add() concatenates them row by row.
    df["Detail"] = df["subcontent"].add(df["content"])
    df["Document Tag"] = "Fact News"
    df = df.drop(["title", "subcontent", "content"], axis=1)

    json_lines = df.apply(lambda row: row.to_json(), axis=1)

    # Context manager guarantees the handle is closed even if a write fails.
    with open(out_path, "w", encoding="utf8") as file_data:
        for json_line in json_lines:
            file_data.write(json_line + "\n")

## BBC

In [None]:
def process_bbc_data(file_path, out_path):
    """Convert a raw BBC JSON-lines dump into the common dataset layout.

    Reads ``file_path``, removes duplicate rows, sentence-segments "title"
    and "content" into "Title" and "Detail", tags every row as "Fact News",
    drops the two source columns and writes one JSON object per line to
    ``out_path``.

    Args:
        file_path: Path of the input JSON-lines file. Must contain "title"
            and "content" columns.
        out_path: Path of the JSON-lines file to write (overwritten).
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)

    # Compare rows by their string form when de-duplicating.
    df = df.loc[df.astype(str).drop_duplicates().index]

    # "title" is wrapped in a list first; "content" is passed as-is
    # (assumes it already holds a list of strings -- TODO confirm).
    df["Title"] = df["title"].apply(to_list).apply(preprocess_sentencepiece)
    df["Detail"] = df["content"].apply(preprocess_sentencepiece)

    df["Document Tag"] = "Fact News"

    df = df.drop(["title", "content"], axis=1)

    json_lines = df.apply(lambda row: row.to_json(), axis=1)

    # Context manager guarantees the handle is closed even if a write fails.
    with open(out_path, "w", encoding="utf8") as file_data:
        for json_line in json_lines:
            file_data.write(json_line + "\n")

## Fake News

In [None]:
def process_fake_news_data(file_path, file_data):
    """Parse a separator-delimited fake-news text file and build its dataset.

    The input is scanned line by line (each line stripped). A line equal to
    "-[separator]-" starts a new article: the next line is its title and all
    following lines, up to the next separator, are concatenated (with no
    delimiter) into its detail. Lines before the first separator are ignored.

    The parsed articles are written as JSON lines to a temporary file, which
    is handed to ``process_fake_news_data_helper`` together with ``file_data``
    and removed afterwards.

    Args:
        file_path: Path of the raw text file (read as UTF-8).
        file_data: Output path forwarded to ``process_fake_news_data_helper``.
    """
    import os
    import tempfile

    records = []
    expect_title = False

    with open(file_path, "r", encoding="utf8") as source:
        for line in source:
            line = line.strip()
            if line == "-[separator]-":
                expect_title = True
                continue

            if expect_title:
                # Title and detail live in the same record, so a separator
                # that is not followed by an article (e.g. two separators in
                # a row) can no longer shift titles against details.
                records.append({"Title": line, "Detail": ""})
                expect_title = False
            elif records:
                records[-1]["Detail"] += line

    # delete=False: the file must be closed before the helper re-opens it by
    # name; it is removed explicitly once the helper is done.
    temp = tempfile.NamedTemporaryFile(
        "w", encoding="utf8", suffix=".json", delete=False
    )
    try:
        with temp:
            for record in records:
                temp.write(json.dumps(record, ensure_ascii=False) + "\n")

        process_fake_news_data_helper(temp.name, file_data)
    finally:
        os.remove(temp.name)

In [None]:
def process_fake_news_data_helper(file_path, out_path):
    """Turn parsed fake-news records into the sentence-segmented dataset.

    Reads ``file_path`` (one JSON object per line), removes duplicate rows,
    wraps "Title" and "Detail" in one-element lists and sentence-segments
    them, tags every row as "Fake News" and writes one JSON object per line
    to ``out_path``.

    Args:
        file_path: Path of the input JSON-lines file. Must contain "Title"
            and "Detail" columns.
        out_path: Path of the JSON-lines file to write (overwritten).
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)

    # Compare rows by their string form when de-duplicating.
    df = df.loc[df.astype(str).drop_duplicates().index]

    for column in ("Title", "Detail"):
        df[column] = df[column].apply(to_list).apply(preprocess_sentencepiece)

    df["Document Tag"] = "Fake News"

    json_lines = df.apply(lambda row: row.to_json(), axis=1)

    # Context manager guarantees the handle is closed even if a write fails.
    with open(out_path, "w", encoding="utf8") as file_data:
        for json_line in json_lines:
            file_data.write(json_line + "\n")

In [None]:
# Build the sentence-segmented dataset for the Kapook dump.
# `file_data` is the output path; the next cell reads it back for inspection.
file_path = "/content/drive/MyDrive/Pattern/raw/kapook_data.json"
file_data = "/content/drive/MyDrive/Pattern/dataset/sentencepiece/kapook_data_sentencepiece_dataset.json"
process_data(file_path, file_data)

In [None]:
# Sanity check: reload the dataset file at `file_data` and preview the first rows.
check = pd.read_json(path_or_buf=file_data, lines=True)
check.head()

Unnamed: 0,Title,Detail,Document Tag
0,"[WHO เผยยอดตายอีโบลา พุ่งทะลุ 1,900 รายแล้ว]","[WHO เผยยอดตายอีโบลา พุ่งทะลุ 1,900 รายแล้วภาพ...",Fact News
1,"[หนุ่มอดีตเด็กอ้วนฮึดฟิตหุ่นล่ำ แต่กลับป่วย ""ไ...","[น้ำหนักเกินร้อย, หันมาฟิตหุ่นจนล่ำกล้ามโต แต่...",Fact News
2,"[อันตราย !, พบสารกันบูดในขนมจีนสูงเกินมาตรฐาน,...","[มูลนิธิเพื่อผู้บริโภค, สุ่มตรวจ พบตกค้างในขนม...",Fact News
3,[กรมอนามัย เตือน กินไข่ดิบเสี่ยงปนเปื้อนเชื้อจ...,[10 ตุลาคม 2557 กรมอนามัย เตือน การรับประทานไข...,Fact News
4,[ดีเอสไอ บุกค้นโรงงานผลิตอาหารเสริม เมโซ หลังพ...,[ดีเอสไอ บุกค้นโรงงานผลิตอาหารเสริม เมโซ หลังพ...,Fact News


In [None]:
# Per-column summary (count / unique / top / freq) of the reloaded dataset.
check.describe()

Unnamed: 0,Title,Detail,Document Tag
count,1019,1019,1019
unique,1019,1019,1
top,"[WHO เผยยอดตายอีโบลา พุ่งทะลุ 1,900 รายแล้ว]","[WHO เผยยอดตายอีโบลา พุ่งทะลุ 1,900 รายแล้วภาพ...",Fact News
freq,1,1,1019


In [None]:
def merge_sentencepiece(file_paths=None, out_path=None):
    """Flatten the per-source datasets into one sentence-per-line text file.

    Every input is a JSON-lines dataset whose "Title" and "Detail" columns
    hold lists of strings. For each row, all "Title" entries and then all
    "Detail" entries are written to the output, one per line, with the
    inputs processed in the order given.

    Args:
        file_paths: Dataset files to merge. Defaults to the six
            ``*_sentencepiece_dataset.json`` files on the mounted Drive.
        out_path: Text file to write (overwritten). Defaults to
            ``sentencepiece.txt`` in the same Drive folder.
    """
    if file_paths is None:
        file_paths = ["/content/drive/MyDrive/Pattern/dataset/sentencepiece/bbc_data_sentencepiece_dataset.json",
                      "/content/drive/MyDrive/Pattern/dataset/sentencepiece/fake_news_data_sentencepiece_dataset.json",
                      "/content/drive/MyDrive/Pattern/dataset/sentencepiece/kapook_data_sentencepiece_dataset.json",
                      "/content/drive/MyDrive/Pattern/dataset/sentencepiece/matichon_data_sentencepiece_dataset.json",
                      "/content/drive/MyDrive/Pattern/dataset/sentencepiece/pptvhd36_data_sentencepiece_dataset.json",
                      "/content/drive/MyDrive/Pattern/dataset/sentencepiece/sanook_data_sentencepiece_dataset.json"]

    if out_path is None:
        out_path = "/content/drive/MyDrive/Pattern/dataset/sentencepiece/sentencepiece.txt"

    # Read everything first and concatenate once instead of growing a frame
    # inside the loop.
    frames = [pd.read_json(path_or_buf=file_path, lines=True)
              for file_path in file_paths]
    df = pd.concat(frames)

    with open(out_path, "w", encoding="utf8") as out:
        for _, row in tqdm(df.iterrows()):
            # Select by column name: positional row[0] / row[1] depends on
            # the column order of the inputs and is deprecated for
            # label-indexed Series.
            for sentence in row["Title"]:
                out.write(sentence + "\n")
            for sentence in row["Detail"]:
                out.write(sentence + "\n")

In [None]:
# Merge all per-source datasets into the single sentence-per-line text file.
merge_sentencepiece()

11852it [00:01, 7375.36it/s]
