<a href="https://colab.research.google.com/github/vvarshne/Spoon-Knife/blob/master/BertPySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.6
Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/4e/b9/2fad4ac1c115dbd1487627e4a95458f9aa5a6a641798d5d10d1ce3eb61bd/spark_nlp-2.6.1-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.1


In [3]:
import sparknlp

# Create the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use so the run can be compared against later ones.
for label, value in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, value)

Spark NLP version:  2.6.1
Apache Spark version:  2.4.6


In [4]:
import nltk

# Fetch the NLTK resources that are imported further down via nltk.corpus:
# the Brown corpus (the documents) and the English stop-word list.
nltk.download('brown')
# Left as the last bare expression so the cell displays the downloader's return value.
nltk.download('stopwords')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Create ./data (no error if it already exists); the newsgroups archive is unpacked into it below.
! mkdir -p data

In [8]:
import os

if not os.path.exists('mini_newsgroups.tar.gz'):
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz
    ! tar xzf mini_newsgroups.tar.gz -C ./data/

--2020-09-25 20:02:17--  https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1860687 (1.8M) [application/x-httpd-php]
Saving to: ‘mini_newsgroups.tar.gz’


2020-09-25 20:02:18 (3.94 MB/s) - ‘mini_newsgroups.tar.gz’ saved [1860687/1860687]



In [11]:
from collections import defaultdict, Counter, OrderedDict

import numpy as np
import pandas as pd
import scipy.sparse as sparse

from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk.corpus import brown
# English stop words held in a set; process() filters tokens by membership in it.
en_stopwords = set(stopwords.words('english'))

In [12]:
def detokenize(sentence):
    """Join a sequence of string tokens back into a single text string.

    A single space is inserted before a token only when some text has already
    been emitted and the token contains at least one alphanumeric character;
    tokens without any alphanumeric character (e.g. punctuation) are attached
    directly to the preceding text.

    Parameters
    ----------
    sentence : iterable of str
        Tokens in reading order.

    Returns
    -------
    str
        The reassembled text ('' for an empty input).
    """
    pieces = []
    for token in sentence:
        # `pieces` is non-empty exactly when some non-empty text was already emitted.
        if pieces and any(ch.isalnum() for ch in token):
            pieces.append(' ')
        if token:
            pieces.append(token)
    return ''.join(pieces)

In [13]:
def process(sentence):
    """Normalize a tokenized sentence into a list of index terms.

    Each token is lower-cased; it is kept only if the lower-cased form is not
    in ``en_stopwords`` and consists solely of alphanumeric characters.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list of str
        The surviving lower-cased terms, in their original order.
    """
    lowered = (token.lower() for token in sentence)
    return [
        term
        for term in lowered
        if term not in en_stopwords and term.isalnum()
    ]

In [15]:

# Map every Brown corpus file id to its sentences, keeping the order
# in which the corpus lists its file ids.
docs = OrderedDict((fid, brown.sents(fid)) for fid in brown.fileids())

In [16]:
# Document index <-> file id lookups, in the iteration order of `docs`.
ix2doc = list(docs)
doc2ix = {fid: i for i, fid in enumerate(ix2doc)}

vocabulary = set()
term_counts = defaultdict(Counter)   # file id -> Counter of term occurrences in that document
document_counts = Counter()          # term -> number of documents containing it

for fid, doc in docs.items():
    terms_in_doc = set()
    for raw_sentence in doc:
        terms = process(raw_sentence)
        term_counts[fid].update(terms)
        terms_in_doc.update(terms)
    # Each term is counted once per document, however often it occurs in it.
    document_counts.update(terms_in_doc)
    vocabulary |= terms_in_doc

# Term index <-> term lookups, with terms in sorted order.
ix2term = sorted(vocabulary)
term2ix = OrderedDict((term, i) for i, term in enumerate(ix2term))

In [17]:
# Fill a (documents x terms) count matrix through a DOK sparse matrix,
# which supports element-wise assignment, then convert it to dense form.
n_docs, n_terms = len(doc2ix), len(term2ix)
counts_dok = sparse.dok_matrix((n_docs, n_terms))

for fid, row in doc2ix.items():
    for term, count in term_counts[fid].items():
        counts_dok[row, term2ix[term]] = count
term_count_mat = counts_dok.todense()

# Document frequency of each term, aligned with the column order of term2ix.
doc_count_vec = np.array([document_counts[term] for term in term2ix])

In [18]:
# Term-frequency weight: log(count + 1), so a count of 0 maps to 0.
tf = np.log(term_count_mat + 1)
# Inverse-document-frequency weight: number of documents divided by (1 + document frequency).
# NOTE(review): this is the raw ratio, not its logarithm as in the common
# tf-idf formulation — confirm this is intentional.
idf = len(doc2ix) / (1 + doc_count_vec)

# np.multiply is used for the element-wise product of tf with idf.
# NOTE(review): term_count_mat comes from .todense(), presumably an np.matrix,
# for which `*` would mean matrix multiplication — verify before simplifying.
tfidf = np.multiply(tf, idf)

In [19]:
# Sanity check of the dimensions: rows are documents, columns are vocabulary terms.
tfidf.shape

(500, 40881)