참조 https://github.com/pko89403/Spark-Test/blob/master/AdvancedAnalyticswithSpark/ch6/LSA.ipynb

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

jar_path = '/usr/spark/jars/spark-xml_2.12-0.11.0.jar'
spark = SparkSession.builder.appName('Chapter06')\
    .master('local[4]')\
    .config("spark.executor.memory", "2g")\
    .config("spark.jars", jar_path)\
    .getOrCreate()

df = spark.read.format("xml") \
                .option("rowTag", "page") \
                .load("wiki_ml.xml")
#                 .load(''/home/jovyan/data/wiki_ml.xml')\
#                 .load(os.getcwd() + 'wiki_ml.xml')

In [2]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _space: string (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)



In [3]:
# html data 형태: title, revision의 text안의 _value가 본문, _space, _byte는 metadata
parsedDF = df.select("title", "revision.text._VALUE")

In [4]:
parsedDF = parsedDF.withColumn("raw_text",F.col("_VALUE")).drop("_VALUE")

In [5]:
parsedDF.show(5)

+--------------------+--------------------+
|               title|            raw_text|
+--------------------+--------------------+
|     Bongard problem|[[File:Bongard_pr...|
|    Generative model|{{About|generativ...|
|      Inductive bias|The '''inductive ...|
|Category:Bayesian...|{{Cat main|Bayesi...|
|Category:Classifi...|{{Commons categor...|
+--------------------+--------------------+
only showing top 5 rows



In [6]:
# 기본적으로 column 끼리 같은 row단위로 처리
# column name인 raw_text string으로 받기 때문에 raw_text가 "[^a-zA-Z0-9\\s]" 변함
# 나머지 아래 값들은 raw_text안의 값들로 지정. 따라서 정규식이 함수처럼 역할
# dataframe내에서 정규식 쓰러면 regexp_replace 
df_clean  = parsedDF.select('title',F.lower(F.regexp_replace('raw_text', "[^a-zA-Z0-9\\s ]", " ")).alias('text'))


In [7]:
df_clean.show(2)

+----------------+--------------------+
|           title|                text|
+----------------+--------------------+
| Bongard problem|  file bongard pr...|
|Generative model|  about generativ...|
+----------------+--------------------+
only showing top 2 rows



In [8]:
import pyspark.ml.feature as ml


# Tokenizer는 python의 string split 역할.

tokenizer = ml.Tokenizer(inputCol='text', outputCol='words_token')
df_words_token = tokenizer.transform(df_clean).select("title","words_token")

# stop word는 자연어 처리에서 나온 개념. 의미없는 단어 삭제. (I, my.. )
remover = ml.StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_wo_stopWords = remover.transform(df_words_token).select('title','words_clean')


In [9]:
df_wo_stopWords_space = df_wo_stopWords.select()

df_wo_stopWords.show(2)

+----------------+--------------------+
|           title|         words_clean|
+----------------+--------------------+
| Bongard problem|[, , file, bongar...|
|Generative model|[, , generative, ...|
+----------------+--------------------+
only showing top 2 rows



In [10]:
# python udf은 row-a-time으로 느리다. 
# pandas udf: The input and output series must have the same size.
from pyspark.sql.functions import pandas_udf,PandasUDFType
from pyspark.sql.types import ArrayType,StringType
import pyspark.sql.column as c
import pandas as pd


#pandas_udf 안에는 return data type, https://spark.apache.org/docs/latest/sql-ref-datatypes.html

# def remove_empty_word(input_array: pd.Series) -> pd.Series:
# @pandas_udf(ArrayType(StringType()))
# def remove_empty_word(input_array: pd.Series) -> pd.Series:
#     return pd.Series(input_array)

@pandas_udf(ArrayType(StringType()))
def remove_empty_word(input_array: pd.Series) -> pd.Series:
    import os
    os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'
    return_list = []
    
    for word_list in input_array:
        tmp_list = []
        for word in word_list:
            if word != '':
                tmp_list.append(word)
        return_list.append(tmp_list)
    return pd.Series(return_list)


removed = remove_empty_word(df_wo_stopWords.words_clean)
df_words_wo_empty = df_wo_stopWords.withColumn('refined_text',removed).select('title','refined_text')  
df_words_wo_empty.show(2)

+----------------+--------------------+
|           title|        refined_text|
+----------------+--------------------+
| Bongard problem|[file, bongard, p...|
|Generative model|[generative, mode...|
+----------------+--------------------+
only showing top 2 rows



# pyarrow의 버전문제(?)로 한 3시간을 고생했다. 
# pyarrow.. 설명
# https://stackoverflow.com/questions/58458415/pandas-scalar-udf-failing-illegalargumentexception
# https://issues.apache.org/jira/browse/SPARK-29367

In [11]:
from nltk.stem.snowball import SnowballStemmer


@pandas_udf(ArrayType(StringType()))
def word_stem(input_array: pd.Series) -> pd.Series:
    import os

    stemmer = SnowballStemmer(language='english')
    os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'
    return_list = []
    for array in input_array:
        tmp_list = []
        for word in array:
            tmp_list.append(stemmer.stem(word))

        return_list.append(tmp_list)
    
    return pd.Series(return_list)

stemmed = word_stem(df_words_wo_empty.refined_text)

# stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
stemmed_words = df_words_wo_empty.withColumn("SnowballStemmed", stemmed).drop('refined_text')
stemmed_words.show(2)



+----------------+--------------------+
|           title|     SnowballStemmed|
+----------------+--------------------+
| Bongard problem|[file, bongard, p...|
|Generative model|[generat, model, ...|
+----------------+--------------------+
only showing top 2 rows



In [12]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# hashingTF = HashingTF(inputCol="SnowballStemmed", outputCol="TF", numFeatures=20)
# featurizedData = hashingTF.transform(stemmed_words)

countVectorizer = CountVectorizer(inputCol="SnowballStemmed",
                                      outputCol="termFreqs",
                                      vocabSize=2000)

vocabModel = countVectorizer.fit(stemmed_words)
docTermFreqs = vocabModel.transform(stemmed_words)
docTermFreqs.cache()


DataFrame[title: string, SnowballStemmed: array<string>, termFreqs: vector]

In [13]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol="termFreqs", 
          outputCol="tfidfVec")
idfModel = idf.fit(docTermFreqs)
docTermMatrix = idfModel.transform(docTermFreqs).select("title", "tfidfVec")

In [14]:
docTermMatrix.show()


+--------------------+--------------------+
|               title|            tfidfVec|
+--------------------+--------------------+
|     Bongard problem|(2000,[1,2,3,4,5,...|
|    Generative model|(2000,[0,1,2,3,4,...|
|      Inductive bias|(2000,[1,2,3,4,5,...|
|Category:Bayesian...|(2000,[2,10,14,27...|
|Category:Classifi...|(2000,[2,8,10,13,...|
|Category:Evolutio...|(2000,[2,10,12,13...|
|Semi-supervised l...|(2000,[0,1,2,3,4,...|
|  Learning automaton|(2000,[1,2,3,4,5,...|
|Category:Machine ...|(2000,[2,8,10,45,...|
|Conditional rando...|(2000,[0,1,2,3,4,...|
|Cross-entropy method|(2000,[0,1,2,3,4,...|
|       Concept drift|(2000,[2,3,7,8,10...|
|    Concept learning|(2000,[0,1,2,3,4,...|
|      Robot learning|(2000,[1,2,5,6,7,...|
|Version space lea...|(2000,[0,1,2,3,4,...|
|Evolvability (com...|(2000,[0,2,3,5,7,...|
|Prior knowledge f...|(2000,[0,1,2,7,8,...|
|  Granular computing|(2000,[0,2,3,4,5,...|
|Probability matching|(2000,[0,2,3,4,5,...|
|Structural risk m...|(2000,[0,1

In [15]:
from pyspark.sql.functions import monotonically_increasing_id

docTermFreqswithID = docTermFreqs.withColumn('id', monotonically_increasing_id()).cache()

In [16]:
docTermFreqswithID.show()


+--------------------+--------------------+--------------------+---+
|               title|     SnowballStemmed|           termFreqs| id|
+--------------------+--------------------+--------------------+---+
|     Bongard problem|[file, bongard, p...|(2000,[1,2,3,4,5,...|  0|
|    Generative model|[generat, model, ...|(2000,[0,1,2,3,4,...|  1|
|      Inductive bias|[induct, bias, al...|(2000,[1,2,3,4,5,...|  2|
|Category:Bayesian...|[cat, main, bayes...|(2000,[2,10,14,27...|  3|
|Category:Classifi...|[common, categori...|(2000,[2,8,10,13,...|  4|
|Category:Evolutio...|[common, cat, evo...|(2000,[2,10,12,13...|  5|
|Semi-supervised l...|[machin, learn, b...|(2000,[0,1,2,3,4,...|  6|
|  Learning automaton|[automata, learn,...|(2000,[1,2,3,4,5,...|  7|
|Category:Machine ...|[research, studi,...|(2000,[2,8,10,45,...|  8|
|Conditional rando...|[multipl, issu, c...|(2000,[0,1,2,3,4,...|  9|
|Cross-entropy method|[cross, entropi, ...|(2000,[0,1,2,3,4,...| 10|
|       Concept drift|[predict, an

In [19]:
docTermFreqswithID.printSchema()


root
 |-- title: string (nullable = true)
 |-- SnowballStemmed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termFreqs: vector (nullable = true)
 |-- id: long (nullable = false)



In [20]:
from pyspark.mllib.linalg.distributed import RowMatrix

from pyspark.mllib.util import MLUtils
vecDF = MLUtils.convertVectorColumnsFromML(docTermMatrix, "tfidfVec")
vecDF.show()

+--------------------+--------------------+
|               title|            tfidfVec|
+--------------------+--------------------+
|     Bongard problem|(2000,[1,2,3,4,5,...|
|    Generative model|(2000,[0,1,2,3,4,...|
|      Inductive bias|(2000,[1,2,3,4,5,...|
|Category:Bayesian...|(2000,[2,10,14,27...|
|Category:Classifi...|(2000,[2,8,10,13,...|
|Category:Evolutio...|(2000,[2,10,12,13...|
|Semi-supervised l...|(2000,[0,1,2,3,4,...|
|  Learning automaton|(2000,[1,2,3,4,5,...|
|Category:Machine ...|(2000,[2,8,10,45,...|
|Conditional rando...|(2000,[0,1,2,3,4,...|
|Cross-entropy method|(2000,[0,1,2,3,4,...|
|       Concept drift|(2000,[2,3,7,8,10...|
|    Concept learning|(2000,[0,1,2,3,4,...|
|      Robot learning|(2000,[1,2,5,6,7,...|
|Version space lea...|(2000,[0,1,2,3,4,...|
|Evolvability (com...|(2000,[0,2,3,5,7,...|
|Prior knowledge f...|(2000,[0,1,2,7,8,...|
|  Granular computing|(2000,[0,2,3,4,5,...|
|Probability matching|(2000,[0,2,3,4,5,...|
|Structural risk m...|(2000,[0,1

In [23]:
type(vecDF.select("tfidfVec"))


pyspark.sql.dataframe.DataFrame

In [30]:
vecRDD = vecDF.select("tfidfVec").rdd.flatMap(lambda x:x)

mat = RowMatrix(vecRDD)
svd = mat.computeSVD(50, computeU=True)


In [32]:
termIds = vocabModel.vocabulary


In [37]:
from pyspark.sql.functions import create_map
docIds = docTermFreqswithID.select(create_map('id', 'title').alias('map'))


In [40]:
docIds.show(5)


+--------------------+
|                 map|
+--------------------+
|[0 -> Bongard pro...|
|[1 -> Generative ...|
|[2 -> Inductive b...|
|[3 -> Category:Ba...|
|[4 -> Category:Cl...|
+--------------------+
only showing top 5 rows



In [41]:
v = svd.V
arr = v.toArray()
print(arr)

[[-5.00858180e-02 -4.35775792e-02 -7.34478182e-01 ...  3.56759872e-02
   2.37933367e-03 -4.18013106e-02]
 [-1.20153559e-01 -3.41520254e-02 -3.08447681e-02 ... -1.37015740e-02
  -7.39789114e-03  5.90371606e-04]
 [ 1.85971700e-18  7.50783037e-18 -1.79453700e-19 ...  6.50199192e-18
   3.71956703e-19 -3.98281096e-18]
 ...
 [-8.78457403e-04 -7.14098445e-04 -3.51210501e-03 ...  1.00068501e-02
   7.24102615e-03  1.51146114e-03]
 [-4.25546919e-03 -1.31132128e-02  2.64652064e-03 ... -1.68947999e-02
  -1.44310037e-02 -3.66791509e-02]
 [-1.78161013e-03 -9.71648805e-05 -1.71628121e-03 ...  5.86748774e-03
   1.67048447e-03  4.57107444e-04]]


In [42]:
transposedArr = arr.transpose()


In [43]:
def topTermsInTopConcepts(svd, numConcepts, numTerms, termIds):
    arr = svd.V.toArray().transpose()
    res = []
    for i,v  in enumerate(arr):
        if( i > numConcepts ): break

        v = list(enumerate(v))
        v.sort(key=lambda x : x[1], reverse=True)
        v = v[0:numTerms]
        v = list((termIds[termId], score) for termId, score in v)
        res.append(v)
    return res

In [44]:
topTermsInTopConcepts(svd, 4, 10, termIds)


[[('learn', 1.8597169995295885e-18),
  ('machin', -1.6035925690269777e-18),
  ('categori', -2.151518488287089e-18),
  ('stub', -7.000211758918947e-05),
  ('leakag', -0.00018164111877602349),
  ('syntaxhighlight', -0.00026571460434930694),
  ('automl', -0.00028923376034158035),
  ('defaultsort', -0.00029098684759717563),
  ('grammat', -0.0003018289912002426),
  ('cohen', -0.0003102629786163099)],
 [('scope', 0.20737308029646853),
  ('col', 0.17076117435749236),
  ('width', 0.16774931818494626),
  ('style', 0.13944109951896713),
  ('dataset', 0.08764291274177796),
  ('et', 0.07907108650953734),
  ('al', 0.07708054580845672),
  ('text', 0.045301999677119204),
  ('imag', 0.04000949418161115),
  ('none', 0.03226686485644428)],
 [('defn', 0.1512680558263076),
  ('gli', 0.0945995509340557),
  ('scope', 0.08570611183432925),
  ('col', 0.06006244542456554),
  ('width', 0.054011334730391075),
  ('style', 0.04656215515172694),
  ('ghat', 0.02959268003578154),
  ('dataset', 0.024496733670962202),


In [215]:
def topDocsInTopConcept(svd, numConcepts, numDocs, docIds):
    u = svd.U
    res = []

    for i, u in enumerate(u.rows.map(lambda i : i.toArray()).collect()):
        if( i > numConcepts ): break
        u = list(enumerate(u))
        u.sort(key=lambda x: x[1], reverse=True)
        u = u[0:numDocs]
        u = list((docIds.collect()[docId][0][docId], score) for docId, score in u)
        res.append(u)
    return res

In [216]:
topDocsInTopConcept(svd, 4, 10, docIds)


[[('Uncertain data', 0.012153971612533395),
  ('Evolvability (computer science)', 0.011101410429071921),
  ('CIML community portal', 0.00883950570724018),
  ('Curse of dimensionality', 0.008634068295918167),
  ('Concept learning', 0.006993182301313434),
  ('Learning with errors', 0.006506466855122496),
  ('Probability matching', 0.006330621971353632),
  ('Ugly duckling theorem', 0.00459418746538158),
  ('Matthews correlation coefficient', 0.004528106337838542),
  ('Conditional random field', 0.0043785429594990375)],
 [('Data pre-processing', 0.05897285911052117),
  ('Matthews correlation coefficient', 0.04545270688773978),
  ('Learning to rank', 0.0431350496466185),
  ('Category:Machine learning researchers', 0.04070487278632756),
  ('Prior knowledge for pattern recognition', 0.040071601265590465),
  ('Granular computing', 0.035673146974228284),
  ('CIML community portal', 0.030504282474660145),
  ('Predictive state representation', 0.028612932855883837),
  ('Category:Learning in compu

In [174]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# hashingTF = HashingTF(inputCol="SnowballStemmed", outputCol="TF", numFeatures=20)
# featurizedData = hashingTF.transform(stemmed_words)

countVectorizer = CountVectorizer(inputCol="SnowballStemmed",
                                      outputCol="termFreqs",
                                      vocabSize=20000)

vocabModel = countVectorizer.fit(stemmed_words)
docTermFreqs = vocabModel.transform(stemmed_words)

idf = IDF(inputCol="TF", outputCol="IDF")
idfModel = idf.fit(featurizedData)


rescaledData = idfModel.transform(featurizedData)


rescaledData.select("title","SnowballStemmed").orderBy('IDF',ascending=True).show()


+--------------------+--------------------+
|               title|     SnowballStemmed|
+--------------------+--------------------+
|Category:Unsuperv...|[cat, main, categ...|
|Category:Datasets...|[categori, datase...|
|Category:Machine ...|[commonscat, cate...|
|Category:Ontology...|[categori, ontolo...|
|Category:Supervis...|[catmain, categor...|
|Category:Semisupe...|[catmain, categor...|
|Category:Applied ...|[catmain, machin,...|
|Multiple-instance...|[redirect, multip...|
|      Validation set|[redirect, train,...|
|Category:Inductiv...|[cat, main, categ...|
|Category:Structur...|[cat, main, struc...|
|Category:Machine ...|[research, studi,...|
|Category:Classifi...|[common, categori...|
|Category:Computat...|[cat, main, compu...|
|Category:Bayesian...|[cat, main, bayes...|
|Category:Cluster ...|[common, cat, clu...|
|Category:Support ...|[catmain, support...|
|Category:Deep lea...|[catmain, deep, l...|
|Category:Latent v...|[categori, statis...|
|Category:Kernel m...|[common, c

In [178]:
rescaledData.withColumn('TF_IDF',rescaledData.IDF * rescaledData.TF)

AnalysisException: "cannot resolve '(`IDF` * `TF`)' due to data type mismatch: '(`IDF` * `TF`)' requires numeric type, not struct<type:tinyint,size:int,indices:array<int>,values:array<double>>;;\n'Project [title#4, SnowballStemmed#244, TF#1160, IDF#1167, (IDF#1167 * TF#1160) AS TF_IDF#1183]\n+- Project [title#4, SnowballStemmed#244, TF#1160, UDF(TF#1160) AS IDF#1167]\n   +- Project [title#4, SnowballStemmed#244, UDF(SnowballStemmed#244) AS TF#1160]\n      +- Project [title#4, SnowballStemmed#244]\n         +- Project [title#4, refined_text#100, word_stem(refined_text#100) AS SnowballStemmed#244]\n            +- Project [title#4, refined_text#100]\n               +- Project [title#4, words_clean#42, remove_empty_word(words_clean#42) AS refined_text#100]\n                  +- Project [title#4, words_clean#42]\n                     +- Project [title#4, words_token#36, UDF(words_token#36) AS words_clean#42]\n                        +- Project [title#4, words_token#36]\n                           +- Project [title#4, text#26, UDF(text#26) AS words_token#36]\n                              +- Project [title#4, lower(regexp_replace(raw_text#13, [^a-zA-Z0-9\\s ],  )) AS text#26]\n                                 +- Project [title#4, raw_text#13]\n                                    +- Project [title#4, _VALUE#10, _VALUE#10 AS raw_text#13]\n                                       +- Project [title#4, revision#3.text._VALUE AS _VALUE#10]\n                                          +- Relation[id#0L,ns#1L,redirect#2,revision#3,title#4] XmlRelation(<function0>,Some(wiki_ml.xml),Map(rowtag -> page, path -> wiki_ml.xml),null)\n"

In [145]:
@pandas_udf(DoubleType())
def multiplication(TF: pd.Series,IDF: pd.Series) -> pd.Series:
    TFIDF = []
    return TFIDF.append(DoubleType(a)*DoubleType(b) for a,b in zip(TF,IDF))
    
rescaledData.printSchema()

root
 |-- title: string (nullable = true)
 |-- SnowballStemmed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TF: vector (nullable = true)
 |-- IDF: vector (nullable = true)



In [128]:
test = rescaledData['TF'].cast('double')
rescaledData.show(2)

# rescaledData_cast = rescaledData.withColumn('value_casted' , rescaledData['TF'].cast('double'))
# rescaledData_cast.show(2)

+----------------+--------------------+--------------------+--------------------+
|           title|     SnowballStemmed|                  TF|                 IDF|
+----------------+--------------------+--------------------+--------------------+
| Bongard problem|[file, bongard, p...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|Generative model|[generat, model, ...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
+----------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [168]:
from pyspark.ml.feature import VectorAssembler
# assembler = VectorAssembler(inputCols=["TF"],outputCol="TF_cast")
# rescaledData = assembler.transform(rescaledData)

assembler = VectorAssembler(inputCols=["IDF"],outputCol="IDF_cast")
rescaledData = assembler.transform(rescaledData)

rescaledData.show(2)

+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           title|     SnowballStemmed|                  TF|                 IDF|             TF_cast|            IDF_cast|
+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| Bongard problem|[file, bongard, p...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[15.0,36.0,15.0,1...|[1.85125280061959...|
|Generative model|[generat, model, ...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|[58.0,70.0,77.0,1...|[7.15817749572909...|
+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [171]:
rescaledData.printSchema()

root
 |-- title: string (nullable = true)
 |-- SnowballStemmed: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TF: vector (nullable = true)
 |-- IDF: vector (nullable = true)
 |-- TF_cast: vector (nullable = true)
 |-- IDF_cast: vector (nullable = true)



In [143]:
col('TF').cast()

Column<b'TF'>

In [135]:
rescaledData['TF'].cast('double')

TypeError: 'Column' object is not callable

In [118]:
rescaledData_test = rescaledData.withColumn("tf_idf",multiplication(rescaledData.TF,rescaledData.IDF))
rescaledData_test.show(2)


Py4JJavaError: An error occurred while calling o2989.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 66.0 failed 1 times, most recent failure: Lost task 0.0 in stage 66.0 (TID 65, localhost, executor driver): java.lang.UnsupportedOperationException: Unsupported data type: struct<type:tinyint,size:int,indices:array<int>,values:array<double>>
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowType(ArrowUtils.scala:56)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowField(ArrowUtils.scala:92)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$$anonfun$toArrowSchema$1.apply(ArrowUtils.scala:116)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$$anonfun$toArrowSchema$1.apply(ArrowUtils.scala:115)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
	at org.apache.spark.sql.types.StructType.foreach(StructType.scala:99)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at org.apache.spark.sql.types.StructType.map(StructType.scala:99)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowSchema(ArrowUtils.scala:115)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$2.writeIteratorToStream(ArrowPythonRunner.scala:71)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor46.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.UnsupportedOperationException: Unsupported data type: struct<type:tinyint,size:int,indices:array<int>,values:array<double>>
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowType(ArrowUtils.scala:56)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowField(ArrowUtils.scala:92)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$$anonfun$toArrowSchema$1.apply(ArrowUtils.scala:116)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$$anonfun$toArrowSchema$1.apply(ArrowUtils.scala:115)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
	at org.apache.spark.sql.types.StructType.foreach(StructType.scala:99)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at org.apache.spark.sql.types.StructType.map(StructType.scala:99)
	at org.apache.spark.sql.execution.arrow.ArrowUtils$.toArrowSchema(ArrowUtils.scala:115)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$2.writeIteratorToStream(ArrowPythonRunner.scala:71)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:346)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:195)


+--------------------+--------------------+
|               title|     SnowballStemmed|
+--------------------+--------------------+
|List of datasets ...|[use, dmi, date, ...|
|Convolutional neu...|[use, cnn, disamb...|
|       Mixture model|[distinguish, mix...|
|Cross-validation ...|[short, descript,...|
|    Machine learning|[short, descript,...|
|         Time series|[use, american, e...|
|Sparse dictionary...|[br, machin, lear...|
|Glossary of artif...|[short, descript,...|
| Pattern recognition|[pattern, recogni...|
|       Random forest|[machin, learn, t...|
|Formal concept an...|[short, descript,...|
|            Word2vec|[machin, learn, b...|
|Statistical class...|[unsupervis, lear...|
|    Algorithmic bias|[short, descript,...|
|Quantum machine l...|[short, descript,...|
|Dimensionality re...|[short, descript,...|
|Matchbox Educable...|[good, articl, us...|
|Apprenticeship le...|[machin, learn, v...|
|  Granular computing|[granular, comput...|
|Conditional rando...|[multipl, 

참고할 코드가 있음에도 불구하고 이해하는데 꽤나 많은 시간이 필요했다.
withColumn의 input이 iterator로 작동하고, 그것을 이용해 UDF를 사용하는데 꽤나 오래걸렸다