# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION AND PROCESSING TECHNIQUES
## Part IV. Spark intro

### 1. Libraries and Spark setup

In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Show complete frames in notebook output: a limit of None disables
# truncation for both columns and rows.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import IndexToString, StringIndexer, OneHotEncoder
from  pyspark.ml.stat import Summarizer

In [None]:
# Show the JupyterHub service prefix this notebook runs under
# (the lookup raises KeyError when the variable is not set).
print('user:', os.environ['JUPYTERHUB_SERVICE_PREFIX'])


def uiWebUrl(self):
    """Return the Spark jobs page path as reachable through the Jupyter proxy.

    Takes the UI address reported by the underlying JVM SparkContext, keeps
    only its port, and builds
    ``<JUPYTERHUB_SERVICE_PREFIX>proxy/<port>/jobs/``.
    """
    from urllib.parse import urlparse

    spark_ui = self._jsc.sc().uiWebUrl().get()
    ui_port = urlparse(spark_ui).port
    hub_prefix = os.environ['JUPYTERHUB_SERVICE_PREFIX']
    return f'{hub_prefix}proxy/{ui_port}/jobs/'


# Replace SparkContext.uiWebUrl with the proxy-aware property so the link
# rendered for the context points at the JupyterHub proxy path.
SparkContext.uiWebUrl = property(uiWebUrl)

# Local-mode Spark with enlarged driver memory and result-size limits
# (results are pulled to the driver by the toPandas() calls below).
conf = SparkConf()
conf.set('spark.master', 'local[*]')
conf.set('spark.driver.memory', '16G')
conf.set('spark.driver.maxResultSize', '8G')
# getOrCreate keeps this cell re-runnable: constructing a second
# SparkContext in the same kernel raises an error. Note that when a context
# already exists it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
spark

### 2. Data load with Spark

In [None]:
# List the input directory (relative to the current working directory).
os.listdir('articles_data')

In [None]:
# Read every JSON file matching the glob into one Spark DataFrame.
# NOTE(review): the reader is used with default options — confirm the
# files match the layout the JSON source expects by default.
file_path = './articles_data/*.json'
sdf = spark.read.format('json').load(file_path)

In [None]:
# Inspect the schema Spark inferred from the JSON files.
sdf.printSchema()

In [None]:
# Number of rows loaded from the JSON files.
sdf.count()

In [None]:
# Preview the first rows as a pandas frame for nicer notebook rendering.
sdf.limit(5).toPandas()

### 3. Text processing

In [None]:
# Record, for every row, the name of the input file it was read from;
# the label is derived from this file name in the next step.
sdf = sdf.withColumn('file', F.input_file_name())
sdf.limit(2).toPandas()

In [None]:
# Reduce the input-file URI in 'file' to the topic label.
#
# Step 1: drop everything up to and including the 'articles_lbl_' file-name
# prefix. The pattern matches any leading directory, so the extraction does
# not depend on where the data directory lives on disk.
sdf = sdf.withColumn(
    'file',
    F.regexp_replace('file', r'^.*/articles_lbl_', ''),
)
# Step 2: strip the extension. The dot is escaped and the pattern is anchored
# to the end of the string, so only a literal trailing '.json' is removed
# (an unescaped '.json' would also match e.g. 'xjson' inside the name).
sdf = sdf.withColumn(
    'label',
    F.regexp_replace('file', r'\.json$', ''),
)
sdf.limit(5).toPandas()

In [None]:
# One output row per element of the 'articles' array, keeping its label.
# The exploded column is left unnamed here, so it gets Spark's default
# name 'col'.
sdf = sdf.select('label', F.explode('articles'))
sdf.limit(5).toPandas()

In [None]:
# Inspect the schema after exploding the articles array.
sdf.printSchema()

In [None]:
def flat_df(df, prefix=None):
    """Return ``df`` with every top-level struct column expanded one level.

    Non-struct columns are kept as they are and come first. Each field of a
    struct column becomes its own column, named ``prefix + field`` when
    ``prefix`` is truthy and ``<struct>_<field>`` otherwise (so both ``None``
    and ``''`` select the ``<struct>_<field>`` naming).
    """
    plain_names = []
    struct_names = []
    for name, dtype in df.dtypes:
        if dtype.startswith('struct'):
            struct_names.append(name)
        else:
            plain_names.append(name)

    expanded = []
    for parent in struct_names:
        for field in df.select(parent + '.*').columns:
            if prefix:
                new_name = prefix + field
            else:
                new_name = parent + '_' + field
            expanded.append(F.col(parent + '.' + field).alias(new_name))

    return df.select(plain_names + expanded)


# Flatten the struct column. An empty prefix is falsy inside flat_df, so the
# new columns are named '<struct>_<field>' (the 'col_*' names used below).
sdf = flat_df(sdf, prefix='')
sdf.limit(5).toPandas()

In [None]:
# Keep only the label plus three article fields, renamed for readability.
sdf_ds = sdf.select(
    'label',
    F.col('col_name').alias('title'),
    F.col('col_annotation').alias('annotation'),
    F.col('col_link').alias('url'),
)
sdf_ds.limit(5).toPandas()

In [None]:
# Number of rows per label.
sdf_ds.groupBy('label').count().show()

In [None]:
# Multi-label check: how many article links occur in more than one row?
(
    sdf
    .groupBy('col_link')
    .count()
    .where('count > 1')
    .sort('count', ascending=False)
    .count()
)

In [None]:
# Encode the string 'label' column as a numeric index in a new 'category'
# column: fit the indexer on the data, then apply the fitted model.
indexer = StringIndexer(inputCol='label', outputCol='category')
model = indexer.fit(sdf_ds)
sdf_ds = model.transform(sdf_ds)
sdf_ds.limit(5).toPandas()

In [None]:
# One-hot encode the category index into the 'vec' column.
# dropLast=False keeps one vector slot per category. With the default
# (dropLast=True) the last category is encoded as an all-zero vector, so it
# would vanish from the multi-hot target built by summing these vectors.
encoder = OneHotEncoder(
    inputCols=['category'],
    outputCols=['vec'],
    dropLast=False,
)
model = encoder.fit(sdf_ds)
sdf_ds = model.transform(sdf_ds)
sdf_ds.limit(5).toPandas()

In [None]:
# Row count before collapsing to one row per article.
sdf_ds.count()

In [None]:
# Collapse to one row per (url, title, annotation) and add up the one-hot
# vectors of the grouped rows into a single 'target' vector.
sdf_ds_ = (
    sdf_ds
    .groupBy('url', 'title', 'annotation')
    .agg(Summarizer.sum(F.col('vec')).alias('target'))
)

In [None]:
# Preview the aggregated dataset.
sdf_ds_.limit(5).toPandas()

In [None]:
# Row count after grouping (one row per url/title/annotation combination).
sdf_ds_.count()

### 4. Save to file

In [None]:
# Collect the aggregated dataset to the driver as a pandas frame and write
# it to CSV (pandas also writes the row index by default).
# NOTE(review): 'arcicles' looks like a typo for 'articles'; the path is left
# unchanged because other code may read this exact file name — confirm.
file_path_ds = 'articles_data/arcicles_dataset.csv'
sdf_ds_.toPandas().to_csv(file_path_ds)