In [1]:
import numpy as np
import pandas as pd
import pyspark
import urllib

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *

In [2]:
# Obtain the notebook's Spark session (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('example')
    .getOrCreate()
)

In [23]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so this cell does not rely on another library having done so.
import urllib.request

# Remote location of the 20 Newsgroups archive.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz"

# Download the archive next to the notebook; the call returns (local_path, headers).
urllib.request.urlretrieve(URL, './20_newsgroups.tar.gz')

('./20_newsgroups.tar.gz', <http.client.HTTPMessage at 0x408342fb80>)

In [10]:
# binaryFiles loads every file under the given path as (filename, bytes) pairs;
# each payload is replaced by its length here, so only the sizes are collected.

(
    spark.sparkContext
    .binaryFiles("./")
    .map(lambda entry: (entry[0], len(entry[1])))
    .collect()
)

[('file:/home/jovyan/work/Iris_example.ipynb', 11758),
 ('file:/home/jovyan/work/Untitled.ipynb', 3531),
 ('file:/home/jovyan/work/iris.csv', 4551),
 ('file:/home/jovyan/work/test.txt', 14)]

In [79]:
import tarfile
from io import BytesIO

def extract_files(data):
    """Yield ``(member_name, content)`` for each regular file in a gzipped tar.

    Args:
        data: A ``(filename, payload)`` pair where ``payload`` holds the raw
            bytes of a ``.tar.gz`` archive. The filename is not used.

    Yields:
        Tuples of the member's path inside the archive and its content as
        ``bytes``. Directories and other non-regular members are skipped.
    """
    # Avoid naming the payload `bytes`, which would shadow the builtin type.
    _filename, payload = data
    # Context managers close the archive and each member stream, including
    # when the consumer stops iterating before the generator is exhausted.
    with tarfile.open(fileobj=BytesIO(payload), mode="r:gz") as archive:
        for member in archive:
            if not member.isfile():
                continue
            with archive.extractfile(member) as stream:
                yield (member.name, stream.read())

# Read the archive as a single (path, bytes) record, expand it into one record
# per contained file, then decode each file's bytes to text using latin-1.
archive_rdd = spark.sparkContext.binaryFiles("./20_newsgroups.tar.gz")
extracted_rdd = archive_rdd.flatMap(extract_files)
data = extracted_rdd.mapValues(lambda raw: raw.decode("latin-1"))

In [81]:
# Preview the first ten records as (name, text length) instead of the full text.
data.mapValues(len).take(10)

[('20_newsgroups/alt.atheism/53366', 1926),
 ('20_newsgroups/alt.atheism/53367', 2456),
 ('20_newsgroups/alt.atheism/51247', 2144),
 ('20_newsgroups/alt.atheism/51248', 929),
 ('20_newsgroups/alt.atheism/51249', 1976),
 ('20_newsgroups/alt.atheism/51250', 3325),
 ('20_newsgroups/alt.atheism/51251', 1421),
 ('20_newsgroups/alt.atheism/51252', 2310),
 ('20_newsgroups/alt.atheism/51253', 3664),
 ('20_newsgroups/alt.atheism/51254', 3392)]

In [82]:
# toDebugString describes the RDD's lineage; it comes back as bytes here,
# so decode it before printing.
lineage = data.toDebugString()
print(lineage.decode('utf8'))

(1) PythonRDD[73] at RDD at PythonRDD.scala:53 []
 |  ./20_newsgroups.tar.gz BinaryFileRDD[71] at binaryFiles at <unknown>:0 []


In [83]:
# Turn the (name, text) RDD into a two-column DataFrame.

column_names = ['filename', 'bytes']
df = data.toDF(column_names)

In [84]:
# Display the first ten rows.
df.show(n=10)

+--------------------+--------------------+
|            filename|               bytes|
+--------------------+--------------------+
|20_newsgroups/alt...|Path: cantaloupe....|
|20_newsgroups/alt...|Xref: cantaloupe....|
|20_newsgroups/alt...|Newsgroups: alt.a...|
|20_newsgroups/alt...|Xref: cantaloupe....|
|20_newsgroups/alt...|Path: cantaloupe....|
|20_newsgroups/alt...|Newsgroups: alt.a...|
|20_newsgroups/alt...|Newsgroups: alt.a...|
|20_newsgroups/alt...|Xref: cantaloupe....|
|20_newsgroups/alt...|Newsgroups: alt.a...|
|20_newsgroups/alt...|Path: cantaloupe....|
+--------------------+--------------------+
only showing top 10 rows

