## pySpark 斷詞後存進 Hadoop

In [None]:
# 起 spark session
import pandas as pd
import numpy as np
import jieba
import findspark
findspark.init('/usr/local/spark')
# 載入必要module
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql import types
# Build (or reuse) the Spark session on the spark:// master, with the
# core and memory limits for this job passed as configuration entries.
ss = (
    SparkSession.builder
    .master("spark://192.168.1.52:7077")
    .appName("cut seg wiki")
    .config("spark.cores.max", "4")
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "24g")
    .config("spark.driver.maxResultSize", "16g")
    .getOrCreate()
)

# 


### 斷詞

In [None]:
# Load the stop words: one entry per line of the UTF-8 text file, with the
# newline characters stripped from both ends of each line.
stopWord_set = set()
stopWords_path = '/home/stat_jerry/stop_word_test1800.txt'
with open(stopWords_path, mode='r', encoding='utf-8') as stopword_file:
    stopWord_set.update(line.strip('\n') for line in stopword_file)
        
# Function used as a UDF: segment text with jieba and filter the tokens
# against the stop words. The result is a list whose items are str.
def cut_stopWord(w):
    """Segment ``w`` with jieba and keep only the meaningful tokens.

    A token is kept when it is longer than one character, does not consist
    solely of digits, and is not a member of the module-level
    ``stopWord_set``.

    Args:
        w: Text to segment. May be ``None`` (Spark hands ``None`` to a
            Python UDF for a null column value) or an empty string.

    Returns:
        list: The kept tokens, in the order jieba produced them. An empty
        list when ``w`` is ``None`` or empty.
    """
    # jieba.cut expects a string; a null row would otherwise raise inside
    # the UDF and abort the whole Spark job. Empty text has no tokens anyway.
    if not w:
        return []
    return [
        word
        for word in jieba.cut(w)
        if len(word) > 1 and not word.isdigit() and word not in stopWord_set
    ]
# Earlier lambda-based variant, kept for reference:
#cut_udf = py_udf(lambda w: [ i for i in jieba.cut(w) if i not in stopWord_set], types.ArrayType(types.StringType()) )
# Wrap cut_stopWord as a Spark UDF whose return type is an array of strings.
cut_udf = fn.udf(cut_stopWord, returnType=types.ArrayType(types.StringType()))

# Read the tab-separated corpus, name its four columns and redistribute the
# rows over 120 partitions.
wiki_df = (
    ss.read
    .csv(path="hdfs://192.168.1.53:9000/corpus/temp/temp_wiki_20180821", sep="\t")
    .toDF("ID", "url", "Title", "text")
    .repartition(120)
)

# Segmentation: keep the four input columns and append the token array
# computed from "text" as the column "words".
wiki_df_cut = wiki_df.withColumn("words", cut_udf(wiki_df["text"]))
#


### 資料存檔與讀取( array_string_transform )

In [None]:
# Helper that flattens an array value into one comma-separated string.
def array_to_string(array_col):
    """Join the elements of ``array_col`` into a single string.

    Each element is converted with ``str`` and the pieces are separated by
    a comma; an empty input yields an empty string.
    """
    return ','.join(map(str, array_col))

# Register the helper as a Spark UDF that returns a string.
array_to_string_udf = fn.udf(array_to_string, types.StringType())

# Store the tokens as one comma-joined string column and drop the array
# column before writing, so only plain string columns reach the CSV writer.
wiki_df_cut = wiki_df_cut.withColumn('column_as_str', array_to_string_udf(wiki_df_cut["words"]))
# The scheme must be "hdfs://" (the path previously read "hdfsw192..."), so
# the output lands at the same HDFS location that is read back afterwards.
wiki_df_cut.drop("words").write.csv( path = "hdfs://192.168.1.53:9000/corpus/temp/wiki_cut_20180909",
                                     mode = "overwrite",
                                     sep = "\t",
                                     compression = "gzip" )
#
# Read the saved file back and name its five columns.
wiki_df_cut = (
    ss.read
    .csv(path="hdfs://192.168.1.53:9000/corpus/temp/wiki_cut_20180909", sep="\t")
    .toDF('ID', 'url', 'Title', 'text', 'column_as_str')
)
# Turn the comma-joined string back into an array column named "words".
wiki_df_cut = wiki_df_cut.withColumn(
    "words",
    fn.split(wiki_df_cut["column_as_str"], ",").cast("array<string>"),
)

# Shut the Spark session down.
ss.stop()