# Import Data

In [0]:
# Connect to the S3 bucket.
# The credentials are read from the environment so they never appear in the notebook.
import os

ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SECRET_KEY")
# "/" must be escaped because the secret is embedded in the s3a:// URI below.
# Stays None when the variable is unset instead of crashing on None.replace().
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F") if SECRET_KEY else None
AWS_BUCKET_NAME = "aida-project"
MOUNT_NAME = "data"

mount_point = "/mnt/%s" % MOUNT_NAME

# Mount the bucket only when it is not mounted yet. Checking explicitly (rather
# than swallowing every exception from mount()) lets real failures such as bad
# credentials or a wrong bucket name surface instead of being hidden.
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if not already_mounted:
  if not ACCESS_KEY or not ENCODED_SECRET_KEY:
    raise RuntimeError(
        "AWS_ACCESS_KEY and AWS_SECRET_KEY must be set to mount %s" % mount_point)
  dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), mount_point)

# Show the mounted content as a quick sanity check.
display(dbutils.fs.ls(mount_point))

path,name,size
dbfs:/mnt/data/TSV/,TSV/,0


In [0]:
# List the raw TSV files available under the mount point.
tsv_listing = dbutils.fs.ls("/mnt/%s/TSV" % MOUNT_NAME)
display(tsv_listing)

path,name,size
dbfs:/mnt/data/TSV/name.basics.tsv,name.basics.tsv,579976550
dbfs:/mnt/data/TSV/title.akas.tsv,title.akas.tsv,969441812
dbfs:/mnt/data/TSV/title.basics.tsv,title.basics.tsv,537519832
dbfs:/mnt/data/TSV/title.principals.tsv,title.principals.tsv,1622240736
dbfs:/mnt/data/TSV/title.ratings.tsv,title.ratings.tsv,16907124


In [0]:
def _load_tsv(path):
  """Read one tab-separated file with a header row into a Spark DataFrame.

  Column types are inferred from the data (``inferSchema``), which makes Spark
  scan the file once more in addition to the actual read.
  """
  reader = spark.read.format("csv")
  reader = reader.option("sep", "\t")
  reader = reader.option("inferSchema", "true")
  reader = reader.option("header", "true")
  return reader.load(path)

# One DataFrame per source file.
df_names = _load_tsv("dbfs:/mnt/data/TSV/name.basics.tsv")
df_akas = _load_tsv("dbfs:/mnt/data/TSV/title.akas.tsv")
df_basics = _load_tsv("dbfs:/mnt/data/TSV/title.basics.tsv")
df_principals = _load_tsv("dbfs:/mnt/data/TSV/title.principals.tsv")
df_ratings = _load_tsv("dbfs:/mnt/data/TSV/title.ratings.tsv")

In [0]:
# Collect every loaded table so they can be inspected together.
list_dfs = [
  df_names,
  df_akas,
  df_basics,
  df_principals,
  df_ratings,
]

# Print the column names and inferred types of each table.
for df in list_dfs:
  df.printSchema()

# Feature Selection & Data Cleaning

In [0]:
from pyspark.sql.functions import mean as _mean, \
                                  min as _min, \
                                  max as _max, \
                                  count as _count, \
                                  stddev as _stddev, \
                                  countDistinct, col, isnan

## Sampling

In [0]:
# Work on a sample of the data until the final model evaluation.

random_state = 42  # seed shared by the random operations in this notebook
sample_size = 0.1  # fraction handed to DataFrame.sample

train_size = 0.8
test_size = 1 - train_size

def sample(dataframe):
  """Return a seeded random subset of the id and target columns.

  Only ``tconst`` and ``averageRating`` are kept. The subset is drawn with
  fraction ``sample_size`` and seed ``random_state``.
  """
  id_and_target = dataframe.select('tconst', 'averageRating')
  return id_and_target.sample(fraction=sample_size, seed=random_state)

# Draw the working sample of (tconst, averageRating) rows from the ratings table.
df_ids = sample(df_ratings)

## Train-Test Split

In [0]:
# Split the sampled ids into train and test sets; the fixed seed keeps the
# split reproducible across runs.
df_train, df_test = df_ids.randomSplit(
    weights=[train_size, test_size],
    seed=random_state,
)

## Table: Ratings

In [0]:
def votes(dataframe):
  """Attach the remaining columns of ``df_ratings`` (e.g. ``numVotes``) to ``dataframe``.

  ``dataframe`` is expected to carry ``tconst`` and ``averageRating`` already.
  The result keeps the same columns, in the same order, as joining on both
  columns would: ``tconst``, the columns of ``dataframe``, then the extra
  rating columns.
  """
  # Join on the title id only. ``averageRating`` is a measure, not a key
  # (presumably inferred as a floating-point column), and an equality join on
  # it would silently drop rows whose rating is null.
  # NOTE(review): assumes ``tconst`` identifies a single row of df_ratings -- confirm.
  rating_extras = df_ratings.drop('averageRating')
  return dataframe.join(rating_extras, on='tconst')

# Join both splits back to the ratings table via votes().
# The names are rebound in place, so re-running this cell joins a second time.
df_train = votes(df_train)
df_test = votes(df_test)

## Table: Principals

In [0]:
def principals(dataframe):
  """Add per-title aggregates computed from ``df_principals``.

  ``dataframe`` is joined to the principals table on ``tconst`` and grouped by
  ``tconst``, ``averageRating`` and ``numVotes``. Two columns are added:

  * ``principal_counts`` -- the largest ``ordering`` value of the title
  * ``distinct_count_categories`` -- the number of distinct ``category`` values
  """
  principal_rows = df_principals.drop('job', 'characters')
  joined = dataframe.join(principal_rows, on='tconst')
  per_title = joined.groupBy('tconst', 'averageRating', 'numVotes')
  return per_title.agg(
      _max('ordering').alias('principal_counts'),
      countDistinct('category').alias('distinct_count_categories'),
  )

# Add the principals aggregates to both splits via principals().
# The names are rebound in place, so re-running this cell joins a second time.
df_train = principals(df_train)
df_test = principals(df_test)

# Write to S3

In [0]:
############### WRITE TRAINING DATA #################

# Spark writes a directory of files; the single data file is copied out of it
# to a plain .csv path afterwards.
train_dir = '/mnt/data/niy/train'

# Headerless CSV without the id column, forced into one part file.
# mode='overwrite' keeps the cell re-runnable: the directory left behind by a
# previous run would otherwise make save() fail because the path already exists.
df_train.drop('tconst') \
  .repartition(1) \
  .write.option("header", "false") \
  .save(train_dir, format='csv', mode='overwrite')

# Pick the data file by name rather than by its position in the listing, which
# also contains Spark's marker files. Same absolute path as used for save().
train_parts = [f.path for f in dbutils.fs.ls(train_dir) if f.name.startswith('part-')]
if len(train_parts) != 1:
  raise RuntimeError(
      "expected exactly one part file in %s, found %d" % (train_dir, len(train_parts)))

file = train_parts[0]
dbutils.fs.cp(file, '/mnt/data/niy/train.csv')
dbutils.fs.rm(file)

In [0]:
############### WRITE TESTING DATA #################

# Spark writes a directory of files; the single data file is copied out of it
# to a plain .csv path afterwards.
test_dir = '/mnt/data/niy/test'

# Headerless CSV without the id column, forced into one part file.
# mode='overwrite' keeps the cell re-runnable: the directory left behind by a
# previous run would otherwise make save() fail because the path already exists.
df_test.drop('tconst') \
  .repartition(1) \
  .write.option("header", "false") \
  .save(test_dir, format='csv', mode='overwrite')

# Pick the data file by name rather than by its position in the listing, which
# also contains Spark's marker files. Same absolute path as used for save().
test_parts = [f.path for f in dbutils.fs.ls(test_dir) if f.name.startswith('part-')]
if len(test_parts) != 1:
  raise RuntimeError(
      "expected exactly one part file in %s, found %d" % (test_dir, len(test_parts)))

file = test_parts[0]
dbutils.fs.cp(file, '/mnt/data/niy/test.csv')
dbutils.fs.rm(file)