## 일반적인 NLP 

In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook (or reuse one that already exists),
# registered under the application name "nlp". All DataFrame reads below go
# through this session.
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

### Tokenizer

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer #데이터들을 쪼개는 작업들을 함

In [5]:
# Load the tab-separated text file as a DataFrame. The file has no header row,
# so Spark names the columns _c0 and _c1; column types are inferred from the data.
df = spark.read.csv(
    "NLP.txt",
    header=False,
    inferSchema=True,
    sep="\t",
)

In [6]:
# Preview the raw data: first 20 rows, long cells truncated (the show() defaults).
df.show(n=20, truncate=True)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
|spam|here is a list of...|
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
+----+--------------------+
only showing top 20 rows



In [7]:
# Replace the auto-generated column names with descriptive ones:
# _c0 holds the spam/ham label and _c1 holds the message text.
df = (
    df.withColumnRenamed("_c0", "label")
      .withColumnRenamed("_c1", "text")
)

In [10]:
# Confirm the rename: the columns should now read "label" and "text".
df.show(n=20, truncate=True)

+-----+--------------------+
|label|                text|
+-----+--------------------+
| spam|here is a list of...|
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Two alternative tokenizers. Both read the "text" column (inputCol) and write
# their token lists to a new "words" column (outputCol).

# Plain Tokenizer: in the output below its tokens are lower-cased but still
# carry punctuation (e.g. "point,", "crazy..").
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# RegexTokenizer with pattern "\\W": splits on every non-word character, so
# punctuation is dropped from the tokens.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)

In [9]:
# Apply the plain Tokenizer to the DataFrame, producing a new "words" column.
tokenized = tokenizer.transform(df)

# Print the first three token lists in full; truncate=False keeps show() from
# cutting long cells. Note that punctuation is still attached to some tokens.
tokenized.select("words").show(n=3, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                               |
+------------------------------------------------------------------------------------------------------------------------------------+
|[here, is, a, list, of, words]                                                                                                      |
|[go, until, jurong, point,, crazy.., available, only, in, bugis, n, great, world, la, e, buffet..., cine, there, got, amore, wat...]|
|[ok, lar..., joking, wif, u, oni...]                                                                                                |
+------------------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [11]:
# Apply the RegexTokenizer. Non-word characters act as separators, so the
# tokens come out without punctuation and are cleaner than the plain
# Tokenizer's result. Stop words such as "is" and "a" are still present.
regexTokenized = regexTokenizer.transform(df)

# Print the first three token lists in full. This tokenizer is the one used
# for the remaining steps of the notebook.
regexTokenized.select("words").show(n=3, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                      |
+---------------------------------------------------------------------------------------------------------------------------+
|[here, is, a, list, of, words]                                                                                             |
|[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat]|
|[ok, lar, joking, wif, u, oni]                                                                                             |
+---------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



## Stop words removal

In [12]:
from pyspark.ml.feature import StopWordsRemover # 불용어를 적용할 수 있음 

In [13]:
# Continue from the regex-tokenized DataFrame (columns: label, text, words).
text_df = regexTokenized
# Optional inspection of the full rows: text_df.show(truncate=False)

In [14]:
# Stop-word filter: reads the token lists in "words" and writes the filtered
# lists to a new "cleaned" column. No custom stop-word list is passed, so the
# transformer's default list is used.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="cleaned",
)

# Apply the filter; text_df now also carries the "cleaned" column.
text_df = remover.transform(text_df)

In [15]:
# Show the result: the "cleaned" column holds the tokens with stop words
# removed. This is the data that the later vectorization steps build on.
text_df.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|             cleaned|
+-----+--------------------+--------------------+--------------------+
| spam|here is a list of...|[here, is, a, lis...|       [list, words]|
|  ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok, lar, joking,...|
| spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|[nah, think, goes...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|
|  ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|
|  ham|As per your reque...|[as, per, your, r...|[per, request, me...|
| spam|WINNER!! As a val...|[winner, as, a, v...|[winner, valued, ...|
| spam|Had your mobile 1...|[had, your, mobil...|[mobile, 11, mont...|
|  ham

## n-grams
- 텍스트 분석을 할 때 불용어를 제거한 데이터를 사용함
- 단어를 한 개씩 따로 사용하는 대신, 리스트에서 연속된 n개의 단어를 하나로 묶어서 분석함

In [16]:
## n-grams # 텍스트 마이닝
from pyspark.ml.feature import NGram

In [17]:
# Bigram builder: n=2 joins every pair of consecutive tokens into one string.
# NOTE(review): this reads the "words" column, which still contains stop words,
# not the stop-word-filtered "cleaned" column. Confirm that this is intended.
bigram = NGram(
    n=2,
    inputCol="words",
    outputCol="bigrams",
)

# Add the "bigrams" column and print the first three rows without truncation.
bigram_df = bigram.transform(text_df)
bigram_df.select("bigrams").show(n=3, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|bigrams                                                                                                                                                                                                                   |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[here is, is a, a list, list of, of words]                                                                                                                                                                                |
|[go until, until jurong, jurong point, point crazy, crazy available, available only, only in, in bugis, bugis n, n 

## BOW 인코딩


In [18]:
from pyspark.ml.feature import HashingTF, IDF,  Tokenizer

In [19]:
# Tokenize the sentences again, repeating the earlier RegexTokenizer step.
# This rebinds the name `tokenizer`, which previously referred to the plain
# Tokenizer instance.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)

# Start from the original df so that words_df holds only label, text and words.
words_df = tokenizer.transform(df)
words_df.show(n=20, truncate=True)

+-----+--------------------+--------------------+
|label|                text|               words|
+-----+--------------------+--------------------+
| spam|here is a list of...|[here, is, a, lis...|
|  ham|Go until jurong p...|[go, until, juron...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|
| spam|Free entry in 2 a...|[free, entry, in,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|
|  ham|Even my brother i...|[even, my, brothe...|
|  ham|As per your reque...|[as, per, your, r...|
| spam|WINNER!! As a val...|[winner, as, a, v...|
| spam|Had your mobile 1...|[had, your, mobil...|
|  ham|I'm gonna be home...|[i, m, gonna, be,...|
| spam|SIX chances to wi...|[six, chances, to...|
| spam|URGENT! You have ...|[urgent, you, hav...|
|  ham|I've been searchi...|[i, ve, been, sea...|
|  ham|I HAVE A DATE ON ...|[i, have, a, date...|
| spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|


In [20]:
# Term-frequency hashing: each token list in "words" becomes a sparse vector of
# length 20 (numFeatures), written to the "hashfeature" column.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="hashfeature",
    numFeatures=20,
)

# Apply the transformer to get a new DataFrame with the hashed vectors.
hashed_df = hashingTF.transform(words_df)
hashed_df.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|         hashfeature|
+-----+--------------------+--------------------+--------------------+
| spam|here is a list of...|[here, is, a, lis...|(20,[7,8,9,12,15]...|
|  ham|Go until jurong p...|[go, until, juron...|(20,[0,3,4,7,8,10...|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|(20,[3,5,9,10,15]...|
| spam|Free entry in 2 a...|[free, entry, in,...|(20,[0,1,2,3,4,7,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|(20,[1,2,3,8,12,1...|
|  ham|Nah I don't think...|[nah, i, don, t, ...|(20,[0,1,4,5,6,8,...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|(20,[0,2,3,4,5,6,...|
|  ham|Even my brother i...|[even, my, brothe...|(20,[3,5,6,8,9,10...|
|  ham|As per your reque...|[as, per, your, r...|(20,[2,3,4,6,8,9,...|
| spam|WINNER!! As a val...|[winner, as, a, v...|(20,[0,2,3,6,7,8,...|
| spam|Had your mobile 1...|[had, your, mobil...|(20,[1,2,3,4,6,8,...|
|  ham

In [21]:
# IDF estimator: reads the hashed term-frequency vectors in "hashfeature" and
# will write the rescaled vectors to "features".
idf = IDF(
    inputCol="hashfeature",
    outputCol="features",
)

# Fit on the hashed DataFrame built above to obtain the IDF model.
idf_model = idf.fit(hashed_df)

In [22]:
# Rescale the term-frequency vectors with the fitted IDF model.
rescale = idf_model.transform(hashed_df)

# Display only the two columns of interest: the label and the TF-IDF vector.
rescale.select("label", "features").show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
| spam|(20,[7,8,9,12,15]...|
|  ham|(20,[0,3,4,7,8,10...|
|  ham|(20,[3,5,9,10,15]...|
| spam|(20,[0,1,2,3,4,7,...|
|  ham|(20,[1,2,3,8,12,1...|
|  ham|(20,[0,1,4,5,6,8,...|
| spam|(20,[0,2,3,4,5,6,...|
|  ham|(20,[3,5,6,8,9,10...|
|  ham|(20,[2,3,4,6,8,9,...|
| spam|(20,[0,2,3,6,7,8,...|
| spam|(20,[1,2,3,4,6,8,...|
|  ham|(20,[1,2,4,5,6,8,...|
| spam|(20,[0,1,3,4,6,7,...|
| spam|(20,[1,2,3,4,5,7,...|
|  ham|(20,[0,1,2,3,4,5,...|
|  ham|(20,[0,7,10,12,16...|
| spam|(20,[0,1,2,3,5,6,...|
|  ham|(20,[1,4,10,15,16...|
|  ham|(20,[0,1,3,4,5,6,...|
|  ham|(20,[0,3,4,5,7,9,...|
+-----+--------------------+
only showing top 20 rows



In [23]:
from pyspark.ml.feature import CountVectorizer

In [24]:
# Count-based bag-of-words encoder over the "words" column. vocabSize=5 caps
# the vocabulary at five terms, and minDF=2.0 sets the minimum document
# frequency a term needs to enter the vocabulary.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=5,
    minDF=2.0,
)

# Learn the vocabulary from words_df, then encode the same DataFrame with it.
model = cv.fit(words_df)
res = model.transform(words_df)

# Rows with none of the vocabulary terms appear as empty vectors, (5,[],[]).
res.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|            features|
+-----+--------------------+--------------------+--------------------+
| spam|here is a list of...|[here, is, a, lis...|       (5,[3],[1.0])|
|  ham|Go until jurong p...|[go, until, juron...|           (5,[],[])|
|  ham|Ok lar... Joking ...|[ok, lar, joking,...|           (5,[],[])|
| spam|Free entry in 2 a...|[free, entry, in,...| (5,[1,3],[3.0,1.0])|
|  ham|U dun say so earl...|[u, dun, say, so,...|           (5,[],[])|
|  ham|Nah I don't think...|[nah, i, don, t, ...| (5,[0,1],[1.0,1.0])|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|(5,[0,1,2],[1.0,2...|
|  ham|Even my brother i...|[even, my, brothe...|       (5,[1],[1.0])|
|  ham|As per your reque...|[as, per, your, r...|       (5,[1],[1.0])|
| spam|WINNER!! As a val...|[winner, as, a, v...|(5,[1,2,3],[2.0,1...|
| spam|Had your mobile 1...|[had, your, mobil...| (5,[1,4],[2.0,2.0])|
|  ham

#### 수치화된 값을 가지고 모델을 생성하고 분석하는 과정을 이어서 진행함