## 일반적인 NLP 

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook, registered as app "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

### Tokenizer

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [5]:
# For now the "NLP" path holds the SMSSpamCollection data; it is read as a
# tab-separated file with no header row.
df = spark.read.csv(
    "NLP",
    sep="\t",
    header=False,
    inferSchema=True,
)

In [6]:
# Preview the raw frame; the columns still carry the auto-generated names _c0 / _c1.
df.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [7]:
# Replace the auto-generated column names with descriptive ones.
df = (
    df
    .withColumnRenamed("_c0", "label")
    .withColumnRenamed("_c1", "text")
)

In [11]:
# Two tokenizers over the same "text" column, both writing to "words":
#   - tokenizer:      plain Tokenizer
#   - regexTokenizer: RegexTokenizer driven by the non-word-character pattern \W
tokenizer = Tokenizer(inputCol="text", outputCol="words")
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)

In [12]:
# Apply the plain Tokenizer and print the first 3 token lists untruncated.
tokenized = tokenizer.transform(df)
tokenized.select("words").show(n=3, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                   |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go, until, jurong, point,, crazy.., available, only, in, bugis, n, great, world, la, e, buffet..., cine, there, got, amore, wat...]                                                    |
|[ok, lar..., joking, wif, u, oni...]                                                                                                                                                    |
|[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkt

In [13]:
# Apply the RegexTokenizer and print the first 3 token lists untruncated,
# for comparison with the plain Tokenizer output.
regexTokenized = regexTokenizer.transform(df)
regexTokenized.select("words").show(n=3, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat]                                                                 |
|[ok, lar, joking, wif, u, oni]                                                                                                                                                              |
|[free, entry, in, 2, a, wkly, comp, to, win,

## Stop words removal

In [14]:
from pyspark.ml.feature import StopWordsRemover

In [15]:
# Use the regex-tokenized frame as the input for stop-word removal.
text_df = regexTokenized
# Optional inspection of the full, untruncated rows:
# text_df.show(truncate=False)

In [16]:
# Filter stop words out of "words" and store the result in a new "cleaned" column.
remover = StopWordsRemover(inputCol="words", outputCol="cleaned")
# Transform the regex-tokenized frame rather than `text_df` itself: feeding the
# previous result back in makes a re-run of this cell fail, because the
# "cleaned" output column would already exist on the input frame.
text_df = remover.transform(regexTokenized)

In [17]:
# Compare the original tokens ("words") with the stop-word-filtered ones ("cleaned").
text_df.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|             cleaned|
+-----+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok, lar, joking,...|
| spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|[nah, think, goes...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|
|  ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|
|  ham|As per your reque...|[as, per, your, r...|[per, request, me...|
| spam|WINNER!! As a val...|[winner, as, a, v...|[winner, valued, ...|
| spam|Had your mobile 1...|[had, your, mobil...|[mobile, 11, mont...|
|  ham|I'm gonna be home...|[i, m, gonna, be,...|[m, gonna, home, ...|
| spam

## n-grams

In [18]:
## n-grams
from pyspark.ml.feature import NGram

In [19]:
# Build 2-grams from the token lists and print the first 3 results untruncated.
# NOTE(review): this reads the unfiltered "words" column, not the
# stop-word-filtered "cleaned" column — confirm that is intended.
bigram = NGram(
    n=2,
    inputCol="words",
    outputCol="bigrams",
)
bigram_df = bigram.transform(text_df)
bigram_df.select("bigrams").show(n=3, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|bigrams                                                                                                                                                                                                                                                                                                                                        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## BOW 인코딩


In [20]:
from pyspark.ml.feature import HashingTF, IDF,  Tokenizer


In [21]:
# Tokenize the raw text again for the bag-of-words section.
# Note: this rebinds `tokenizer`, which an earlier cell bound to a plain
# Tokenizer; from here on it is a RegexTokenizer.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)
words_df = tokenizer.transform(df)
words_df.show()

+-----+--------------------+--------------------+
|label|                text|               words|
+-----+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|
| spam|Free entry in 2 a...|[free, entry, in,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|
|  ham|Even my brother i...|[even, my, brothe...|
|  ham|As per your reque...|[as, per, your, r...|
| spam|WINNER!! As a val...|[winner, as, a, v...|
| spam|Had your mobile 1...|[had, your, mobil...|
|  ham|I'm gonna be home...|[i, m, gonna, be,...|
| spam|SIX chances to wi...|[six, chances, to...|
| spam|URGENT! You have ...|[urgent, you, hav...|
|  ham|I've been searchi...|[i, ve, been, sea...|
|  ham|I HAVE A DATE ON ...|[i, have, a, date...|
| spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|
|  ham|Oh k...i'm watchi...|[oh, k, i, m, wat...|


In [22]:
# Hash each token list into a term-frequency vector with 20 features.
# NOTE(review): 20 buckets is very small, so distinct words presumably share
# buckets — acceptable for a demo, confirm before real use.
hashingTF = HashingTF(
    numFeatures=20,
    inputCol="words",
    outputCol="hashfeature",
)
hashed_df = hashingTF.transform(words_df)
hashed_df.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|         hashfeature|
+-----+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|(20,[0,3,4,7,8,10...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|(20,[3,5,9,10,15]...|
| spam|Free entry in 2 a...|[free, entry, in,...|(20,[0,1,2,3,4,7,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|(20,[1,2,3,8,12,1...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|(20,[0,1,4,5,6,8,...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|(20,[0,2,3,4,5,6,...|
|  ham|Even my brother i...|[even, my, brothe...|(20,[3,5,6,8,9,10...|
|  ham|As per your reque...|[as, per, your, r...|(20,[2,3,4,6,8,9,...|
| spam|WINNER!! As a val...|[winner, as, a, v...|(20,[0,2,3,6,7,8,...|
| spam|Had your mobile 1...|[had, your, mobil...|(20,[1,2,3,4,6,8,...|
|  ham|I'm gonna be home...|[i, m, gonna, be,...|(20,[1,2,4,5,6,8,...|
| spam

In [23]:
# Fit an IDF model on the hashed term-frequency vectors; it will write its
# output to a "features" column.
idf = IDF(
    inputCol="hashfeature",
    outputCol="features",
)
idf_model = idf.fit(hashed_df)

In [24]:
# Apply the fitted IDF model and show the label next to the resulting "features" vectors.
rescale = idf_model.transform(hashed_df)
rescale.select("label","features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  ham|(20,[0,3,4,7,8,10...|
|  ham|(20,[3,5,9,10,15]...|
| spam|(20,[0,1,2,3,4,7,...|
|  ham|(20,[1,2,3,8,12,1...|
|  ham|(20,[0,1,4,5,6,8,...|
| spam|(20,[0,2,3,4,5,6,...|
|  ham|(20,[3,5,6,8,9,10...|
|  ham|(20,[2,3,4,6,8,9,...|
| spam|(20,[0,2,3,6,7,8,...|
| spam|(20,[1,2,3,4,6,8,...|
|  ham|(20,[1,2,4,5,6,8,...|
| spam|(20,[0,1,3,4,6,7,...|
| spam|(20,[1,2,3,4,5,7,...|
|  ham|(20,[0,1,2,3,4,5,...|
|  ham|(20,[0,7,10,12,16...|
| spam|(20,[0,1,2,3,5,6,...|
|  ham|(20,[1,4,10,15,16...|
|  ham|(20,[0,1,3,4,5,6,...|
|  ham|(20,[0,3,4,5,7,9,...|
| spam|(20,[1,6,7,8,10,1...|
+-----+--------------------+
only showing top 20 rows



In [25]:
from pyspark.ml.feature import CountVectorizer

In [26]:
# Count-based bag of words over the "words" column: the vocabulary is capped at
# 5 terms (vocabSize=5) with a minimum document frequency of 2 (minDF=2.0).
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=5,
    minDF=2.0,
)
model = cv.fit(words_df)
res = model.transform(words_df)
res.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|            features|
+-----+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|           (5,[],[])|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|           (5,[],[])|
| spam|Free entry in 2 a...|[free, entry, in,...| (5,[1,3],[3.0,1.0])|
|  ham|U dun say so earl...|[u, dun, say, so,...|           (5,[],[])|
|  ham|Nah I don't think...|[nah, i, don, t, ...| (5,[0,1],[1.0,1.0])|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|(5,[0,1,2],[1.0,2...|
|  ham|Even my brother i...|[even, my, brothe...|       (5,[1],[1.0])|
|  ham|As per your reque...|[as, per, your, r...|       (5,[1],[1.0])|
| spam|WINNER!! As a val...|[winner, as, a, v...|(5,[1,2,3],[2.0,1...|
| spam|Had your mobile 1...|[had, your, mobil...| (5,[1,4],[2.0,2.0])|
|  ham|I'm gonna be home...|[i, m, gonna, be,...| (5,[0,1],[3.0,1.0])|
| spam