# Introduction to Big Data Modern Technologies course

## TOPIC 5: Data lake concept and tools
### Part 1. Apache Spark for data analysis

### 1. Libraries and credentials

In [None]:
import os
import sys
import json
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [None]:
def access_data(file_path):
    """Load access settings from a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON content. The notebook reads the
        `aws_access_key_id` and `aws_secret_access_key` keys from it.
    """
    # JSON files are UTF-8 by specification; do not depend on the locale default
    with open(file_path, encoding='utf-8') as file:
        return json.load(file)

creds = access_data(file_path='access_bucket.json')
print(creds.keys())

<font color='red'>__IMPORTANT NOTE__</font>
1. Do not set credentials (keys, secrets, passwords) explicitly in your code
2. Do not print out variables with credentials in your code

### 2. Data files in object storage

We will use test data from the VK social platform. The data was collected through the [VK API](https://dev.vk.com/reference).

There are three groups in the bucket:
- gsom_abiturient
- gsom_ma
- gsom.spbu

The data is updated every week; the latest available snapshots are stored in these folders:
- gsom_abiturient-2022-12-20-05-00-20-364747
- gsom_ma-2022-12-19-05-00-08-338606
- gsom_spbu-2022-12-14-05-00-08-207227

Structure in every folder is as follows:
- `/walls` folder with the wall data of the group's members
- `<GROUP_NAME>.json` (e.g.`gsom_abiturient.json`) file with the group description
- `members_full_group_<GROUP_NAME>.json` (e.g. `members_full_group_gsom_abiturient.json`) file with the full data for members of the group
- `members_group_<GROUP_NAME>.json` (e.g. `members_group_gsom_abiturient.json`) file with the list of group's members
- `wall_owner_id_<GROUP_ID>.json` (e.g. `wall_owner_id_23777199.json`) file with wall of the group

#### 2.1. Session and client to access files

[About boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) library.

In [None]:
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=creds['aws_access_key_id'],
    aws_secret_access_key=creds['aws_secret_access_key'],
    endpoint_url='https://storage.yandexcloud.net'
)

In [None]:
VK_DATA_BUCKET = 'apid-data-vk'

In [None]:
all_files = [key['Key'] for key in s3.list_objects(Bucket=VK_DATA_BUCKET)['Contents']]
print('files in storage:', all_files[:10]) # works only for num of files < 1000

In [None]:
len(all_files)

In [None]:
def get_all_s3_objects(s3, bucket, upfolder, verbose=False):
    """List the keys of all objects in a bucket under a given prefix.

    `list_objects_v2` is called repeatedly, following the continuation
    token, until the response is no longer marked as truncated.

    Args:
        s3: Client object that provides `list_objects_v2`.
        bucket: Name of the bucket to list.
        upfolder: Key prefix to filter by (empty string lists everything).
        verbose: If true, print the running number of keys after each page.

    Returns:
        List of object keys in the order the pages returned them.
    """
    request = {'Bucket': bucket, 'Prefix': upfolder}
    loaded = []
    while True:
        s3_result = s3.list_objects_v2(**request)
        # a page may have no 'Contents' at all (e.g. nothing matches the prefix)
        loaded.extend(item['Key'] for item in s3_result.get('Contents', []))
        if verbose:
            print(f'loaded: {len(loaded)}')
        if not s3_result.get('IsTruncated'):
            break
        request['ContinuationToken'] = s3_result['NextContinuationToken']
    return loaded

In [None]:
all_files = get_all_s3_objects(s3, bucket=VK_DATA_BUCKET, upfolder='')

In [None]:
len(all_files)

#### 2.2. Data structure

In [None]:
[x for x in all_files if 'gsom_abiturient' in x]

In [None]:
set([x.split('/')[0] for x in all_files if 'gsom_abiturient' in x])

In [None]:
# last data for `gsom_abiturient`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_abiturient' in x]), 
    reverse=True
)[0]

In [None]:
# last data for `gsom_ma`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_ma' in x]), 
    reverse=True
)[0]

In [None]:
# last data for `gsom_spbu`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_spbu' in x]), 
    reverse=True
)[0]

#### 2.3. Load data from the storage

In [None]:
last_folder = sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_ma' in x]), 
    reverse=True
)[0]

file_to_load = last_folder + '/gsom_ma.json'
print('file to load:', file_to_load)

In [None]:
print('file to load:', file_to_load)
get_object_response = s3.get_object(
    Bucket=VK_DATA_BUCKET, 
    Key=file_to_load
)

In [None]:
get_object_response

In [None]:
data = json.load(get_object_response['Body'])
type(data)

In [None]:
len(data)

In [None]:
data[0]

In [None]:
df = pd.DataFrame(data)
df.head()

In [None]:
data[0].keys()

In [None]:
data[0]['name']

In [None]:
data[0]['description']

In [None]:
data[0]['status']

### 4. Time for Spark

#### 4.1. Read data from S3

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf, struct, count_distinct, from_unixtime

In [None]:
# web UI for the Spark

def uiWebUrl(self):
    """Return the address of the Spark web UI for this context.

    When the `JUPYTERHUB_SERVICE_PREFIX` environment variable is set,
    the result is `<prefix>proxy/<port>/jobs/`, where `<port>` is taken
    from the URL reported by Spark. Otherwise the URL reported by Spark
    is returned unchanged instead of failing with `KeyError`.
    """
    from urllib.parse import urlparse

    web_url = self._jsc.sc().uiWebUrl().get()
    service_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX')
    if service_prefix is None:
        # no JupyterHub prefix to build a proxied link from
        return web_url
    port = urlparse(web_url).port
    return '{}proxy/{}/jobs/'.format(service_prefix, port)

SparkContext.uiWebUrl = property(uiWebUrl)

# Spark settings
conf = SparkConf()
conf.set('spark.master', 'local[*]')    # max 5 cores available, use `local[*]` for all cores
conf.set('spark.driver.memory', '16G')  # max 16 GB available
conf.set('spark.driver.maxResultSize', '4G')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# Spark's access for object storage settings
# NOTE: the endpoint uses HTTPS (like the boto3 client above), so data
# is not exchanged with the object storage in plain text
hadoop_conf = spark._jsc.hadoopConfiguration()
s3a_settings = {
    'fs.s3a.access.key': creds['aws_access_key_id'],
    'fs.s3a.secret.key': creds['aws_secret_access_key'],
    'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'fs.s3a.multipart.size': '104857600',
    'fs.s3a.block.size': '33554432',
    'fs.s3a.threads.max': '256',
    'fs.s3a.endpoint': 'https://storage.yandexcloud.net',
    'fs.s3a.aws.credentials.provider':
        'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
}
for option, value in s3a_settings.items():
    hadoop_conf.set(option, value)

spark

In [None]:
all_files[:10]

In [None]:
all_files[3]

Spark can [read JSON](https://spark.apache.org/docs/latest/sql-data-sources-json.html):

In [None]:
file_path = f's3a://{VK_DATA_BUCKET}/' + all_files[3]
sdf = spark.read.json(file_path)

#### 4.2. Basic data processing

Schema is your primary info tool for Spark dataframe:

In [None]:
sdf.printSchema()

In [None]:
sdf.show()

##### Selecting columns

In [None]:
sdf.columns

In [None]:
sdf.select('count').show()

In [None]:
sdf.select('profiles').show()

In [None]:
sdf.select('count', 'profiles').show()

In [None]:
sdf.select(sdf.profiles).show()

In [None]:
# does not work 
# why?
sdf.select(sdf.count, sdf.profiles).show()

In [None]:
# here is the answer
sdf.count()

In [None]:
# that's better
sdf.select(sdf['count'], sdf.profiles).show()

##### How to deal with `array`

Just `explode` them with `pyspark.sql` functions...

In [None]:
F.explode(sdf.items)

In [None]:
sdf.select(F.explode(sdf.items))

In [None]:
sdf.select(F.explode(sdf.items)).show()

In [None]:
sdf.select(F.explode(sdf.groups)).show()

In [None]:
sdf_exploded = sdf.select(F.explode(sdf.groups).alias('groups'))
sdf_exploded.show()

In [None]:
sdf_exploded.printSchema()

In [None]:
sdf_exploded.limit(5).toPandas()

In [None]:
# many `explode`s
# do not work
sdf_exploded = sdf.select(
    F.explode(sdf.groups),
    F.explode(sdf.items)
)
sdf_exploded.show()

In [None]:
sdf.printSchema()

In [None]:
# but you can `explode` one by one
# `explode` groups at the first step
sdf = sdf.select(
    F.explode(sdf.groups).alias('groups'),
    sdf.items
)
sdf.show()

In [None]:
sdf.printSchema()

In [None]:
# `explode` items at the second step
sdf = sdf.select(
    sdf.groups,
    F.explode(sdf.items).alias('items')
)
sdf.show()

In [None]:
sdf.printSchema()

##### How to deal with `struct`

In [None]:
sdf.columns

In [None]:
sdf.groups

In [None]:
sdf.dtypes

In [None]:
# let's start from `groups` column
sdf.select('groups.*')

In [None]:
sdf.select('groups.*').columns

In [None]:
sdf.select('groups.*').show()

In [None]:
sdf.select(
    'groups.*', 
    'items.*'
).show()

In [None]:
sdf.select(
    'groups.*', 
    'items.*'
).limit(5).toPandas()

What problems can arise when 'unstructuring' (flattening) nested columns? For example:
- dataframe can contain mixed types of columns (`nested` aka `struct` and plain columns)
- same sub-columns names within different structures

The function below solves these problems:

In [None]:
def flat_df(df, prefix=None):
    """
    Flatten the `struct` columns of a Spark dataframe.

    Plain columns are kept as they are, and every field of every
    `struct` column becomes a separate top-level column. Array type
    columns are not exploded.

    Parameters
    ----------
    df : Spark dataframe to flatten.
    prefix : optional string put in front of each extracted field name.
        When it is omitted or empty, the field is named
        `<struct column>_<field>` instead.

    Returns
    -------
    New dataframe: the plain columns followed by the extracted fields.

    """
    plain_names = []
    struct_names = []
    for name, dtype in df.dtypes:
        if dtype[:6] == 'struct':
            struct_names.append(name)
        else:
            plain_names.append(name)

    extracted = []
    for struct_name in struct_names:
        for field in df.select(struct_name + '.*').columns:
            if prefix:
                new_name = prefix + field
            else:
                new_name = struct_name + '_' + field
            extracted.append(F.col(struct_name + '.' + field).alias(new_name))

    return df.select(plain_names + extracted)

In [None]:
sdf_unstructured = flat_df(sdf, prefix=None)
sdf_unstructured.printSchema()

In [None]:
sdf_unstructured.limit(5).toPandas()

##### Conversion to timestamps

In [None]:
sdf_unstructured = sdf_unstructured.withColumn(
    'items_date',
    F.to_timestamp('items_date')
)

In [None]:
sdf_unstructured.limit(5).toPandas()

##### Value counts and group by operations

In [None]:
sdf_unstructured.groupBy('items_owner_id').count().orderBy('count').collect()

In [None]:
sdf_unstructured.groupBy('items_post_type').count().orderBy('count').toPandas()

In [None]:
sdf_unstructured.groupBy('groups_screen_name').count().orderBy('count').toPandas()

In [None]:
sdf_unstructured.groupBy('items_id').count().orderBy('count').toPandas()

##### Filtering

In [None]:
sdf_unstructured.filter(sdf_unstructured.items_id == 2509).toPandas()

In [None]:
sdf_unstructured.filter(
    (sdf_unstructured.items_id == 2509) &
    (sdf_unstructured.groups_name.contains('СПбГУ'))
).toPandas()

#### 4.3. Data processing for EDA (Exploratory Data Analysis)

##### Read many files at once

In [None]:
# last data for `gsom_abiturient`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_abiturient' in x]), 
    reverse=True
)[0]

In [None]:
# last data for `gsom_ma`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_ma' in x]), 
    reverse=True
)[0]

In [None]:
# last data for `gsom_spbu`
sorted(
    set([x.split('/')[0] for x in all_files if 'gsom_spbu' in x]), 
    reverse=True
)[0]

We can pass a list of files to Spark reader:

In [None]:
# one wildcard path per group snapshot; Spark reads them all into one dataframe
file_path = [
    f's3a://{VK_DATA_BUCKET}/gsom_spbu-2022-11-16-05-00-11-755938/wall_owner_id_*.json',
    f's3a://{VK_DATA_BUCKET}/gsom_ma-2022-11-14-05-00-20-220713/wall_owner_id_*.json',
    f's3a://{VK_DATA_BUCKET}/gsom_abiturient-2022-11-15-05-00-12-366823/wall_owner_id_*.json',
]
sdf = spark.read.json(file_path)

...or can pass a mask for files' names to Spark reader:

In [None]:
file_mask = f's3a://{VK_DATA_BUCKET}/gsom_*-2022-11-1*/wall_owner_id_*.json'
sdf = spark.read.json(file_mask)

In [None]:
sdf.printSchema()

In [None]:
sdf.show()

##### Creating dataset

In [None]:
sdf = sdf.select(
    F.explode(sdf.groups).alias('groups'), 
    sdf.items
)
sdf = flat_df(sdf, prefix='')
sdf.limit(5).toPandas()

In [None]:
sdf = sdf.select(
    sdf.groups_id, 
    sdf.groups_name, 
    sdf.groups_type,
    F.explode(sdf.items).alias('items')
)
sdf = flat_df(sdf, prefix='')
sdf.limit(5).toPandas()

In [None]:
sdf.groupBy('groups_name').count().show()

In [None]:
sdf.groupBy('groups_name', 'groups_type').count().show()

In [None]:
sdf = sdf.withColumn(
    'items_date',
    F.to_timestamp('items_date')
)

In [None]:
sdf_posts = sdf.select(
    sdf.items_date,
    sdf.items_comments,
    sdf.items_likes,
    sdf.items_reposts,
    sdf.items_views,
    sdf.items_text,
    sdf.groups_name,
    sdf.groups_type
).dropDuplicates()
sdf_posts.limit(5).toPandas()

In [None]:
sdf_posts.printSchema()

In [None]:
sdf_posts = flat_df(sdf_posts, prefix='')
sdf_posts.limit(5).toPandas()

In [None]:
sdf_posts = sdf_posts.select(
    sdf_posts.items_date,
    sdf_posts.items_text,
    sdf_posts.items_comments_count,
    sdf_posts.items_likes_count,
    sdf_posts.items_reposts_count
).dropDuplicates()
sdf_posts.limit(5).toPandas()

In [None]:
sdf_posts.count()

### 5. EDA with Spark

#### 5.1. Top posts

In [None]:
# by comments
sdf_posts.sort(sdf_posts.items_comments_count.desc()).limit(5).toPandas()

In [None]:
# by likes
sdf_posts.sort(sdf_posts.items_likes_count.desc()).limit(5).toPandas()

In [None]:
# by reposts
sdf_posts.sort(sdf_posts.items_reposts_count.desc()).limit(5).toPandas()

#### 5.2. Time trends

In [None]:
sdf_posts.sort('items_date')

In [None]:
# likes by days
# both columns come from one sorted result, so each date stays paired
# with its own likes count (and Spark runs one job instead of two)
likes_by_date = (
    sdf_posts
    .sort('items_date')
    .select('items_date', 'items_likes_count')
    .collect()
)
time_axis = [row.items_date for row in likes_by_date]
likes_count = [row.items_likes_count for row in likes_by_date]

In [None]:
plt.figure(figsize=(16, 6))
plt.plot(time_axis, likes_count)
plt.ylabel('likes')
plt.xlabel('timestamp')
plt.title('Likes over the time')
plt.legend(['likes'], loc='upper left')
plt.show()

In [None]:
# reposts by days
# dates are collected together with the counts, so the time axis is
# guaranteed to be in the same row order as the reposts
reposts_by_date = (
    sdf_posts
    .sort('items_date')
    .select('items_date', 'items_reposts_count')
    .collect()
)
time_axis = [row.items_date for row in reposts_by_date]
reposts_count = [row.items_reposts_count for row in reposts_by_date]

In [None]:
# line chart of reposts over time
plt.figure(figsize=(16, 6))
plt.plot(time_axis, reposts_count)
plt.ylabel('reposts')  # the y axis shows reposts, not likes
plt.xlabel('timestamp')
plt.title('Reposts over the time')
plt.legend(['reposts'], loc='upper left')
plt.show()

In [None]:
# comments by day of week
sdf_posts = sdf_posts.withColumn('items_date_dow', F.dayofweek('items_date'))
sdf_posts.limit(5).toPandas()

In [None]:
comments_dow = sdf_posts.groupBy('items_date_dow').sum().collect()

In [None]:
comments_dow

In [None]:
comments_dow = [(x['items_date_dow'], x['sum(items_comments_count)']) for x in comments_dow]
comments_dow

In [None]:
dow = [x[0] for x in comments_dow]
comments = [x[1] for x in comments_dow]

In [None]:
# bar chart of the total number of comments per day of the week
plt.figure(figsize=(12, 6))
plt.bar(dow, comments)
plt.ylabel('comments')
plt.xlabel('day of week')
plt.title('Comments by day of the week')
plt.legend(['comments'], loc='upper left')
plt.show()

#### 5.3. Correlations

In [None]:
# reposts vs likes
# both metrics are taken from the same collected rows, so every point
# of the scatter plot describes a single post
likes_reposts = (
    sdf_posts
    .sort('items_date')
    .select('items_likes_count', 'items_reposts_count')
    .collect()
)
likes = [row.items_likes_count for row in likes_reposts]
reposts = [row.items_reposts_count for row in likes_reposts]

In [None]:
plt.figure(figsize=(16, 6))
plt.scatter(likes, reposts)
plt.ylabel('reposts')
plt.xlabel('likes')
plt.title('Reposts vs Likes')
plt.show()

In [None]:
# comments and likes
# likes are collected again together with the comments, so both lists
# are guaranteed to follow the same row order
likes_comments = (
    sdf_posts
    .sort('items_date')
    .select('items_likes_count', 'items_comments_count')
    .collect()
)
likes = [row.items_likes_count for row in likes_comments]
comments = [row.items_comments_count for row in likes_comments]

In [None]:
plt.figure(figsize=(16, 6))
plt.scatter(likes, comments)
plt.ylabel('comments')
plt.xlabel('likes')
plt.title('Comments vs Likes')
plt.show()

#### 5.4. Word cloud

##### Word count task

Let's see the power of [Spark's RDDs](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.html):

In [None]:
# we need RDD for the `word count` task
rdd_posts = sdf_posts.limit(1000).rdd

In [None]:
# first 5 records
rdd_posts.take(5)

In [None]:
# get text field from RDD
rdd_posts.take(5)[4].items_text

In [None]:
# take only posts' texts
rdd_posts.map(lambda x: x.items_text).take(5)

In [None]:
# convert to collection of words
rdd_posts.map(lambda x: x.items_text) \
    .flatMap(lambda line: line.split(' ')) \
    .take(5)

In [None]:
# `map` function (like MapReduce algorithm)
rdd_posts.map(lambda x: x.items_text) \
    .flatMap(lambda line: line.split(' ')) \
    .map(lambda word: (word, 1)) \
    .take(5)

In [None]:
# ...and `reduce` function too
rdd_posts.map(lambda x: x.items_text) \
    .flatMap(lambda line: line.split(' ')) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .take(5)

In [None]:
# sort to find more frequent words
rdd_posts.map(lambda x: x.items_text) \
    .flatMap(lambda line: line.split(' ')) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy( lambda x: x[1] , ascending=False) \
    .take(5)

In [None]:
# final solution with no short words
# short words are dropped before counting, so fewer records are
# shuffled by `reduceByKey` and sorted by `sortBy`
rdd_posts = rdd_posts.map(lambda x: x.items_text) \
    .flatMap(lambda line: line.split(' ')) \
    .filter(lambda word: len(word) > 3) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: x[1], ascending=False)
rdd_posts.take(10)

##### Resulting diagram

In [None]:
result = rdd_posts.collect()
result

In [None]:
freqs = {x[0]: x[1] for x in result}
freqs

In [None]:
freqs_bar = {k: v for k, v in freqs.items() if 50 <= v <= 1000}
freqs_bar

In [None]:
plt.figure(figsize=(18, 6))
plt.bar(*zip(*freqs_bar.items()))
plt.xticks(rotation='vertical')
plt.show()

### 6. Home assignment (optional)

Your home assignment for this part is to repeat all the steps but with new data e.g. from the wall of any group. 

As an example you can take data like:

In [None]:
file_mask = f's3a://{VK_DATA_BUCKET}/gsom_ma-2022-11-14-05-00-20-220713/walls/wall_owner_id_*.json'
sdf = spark.read.json(file_mask)

In [None]:
sdf.printSchema()

In [None]:
sdf.count()