<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#BigData---Final-Project" data-toc-modified-id="BigData---Final-Project-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>BigData - Final Project</a></span><ul class="toc-item"><li><span><a href="#Loading-The-Data" data-toc-modified-id="Loading-The-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Loading The Data</a></span></li><li><span><a href="#Exploring-The-Data" data-toc-modified-id="Exploring-The-Data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Exploring The Data</a></span></li></ul></li></ul></div>

# BigData - Final Project

__AUTHORS__:
  - Théo Perinet (22172 - theo.perinet)
  - Mathieu Rivier (23553 - mathieu.rivier)
  - Marc Monteil (23742 - marc.monteil)

###### To Use when you are on google collab
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz  
!tar xf spark-3.2.1-bin-hadoop2.7.tgz
!pip install -q findspark

###### TO USE WHEN YOU ARE ON GOOGLE COLLAB
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"
import findspark
findspark.init()

from google.colab import drive
drive.mount('/content/drive')

## Loading The Data

In [1]:
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
spark_application_name = "WannaFlop_Project"

In [3]:
spark = (SparkSession.builder.appName(spark_application_name).getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 09:35:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Exploring The Data

In [4]:
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, StructType,StructField
from pyspark.sql.functions import desc
import pyspark.sql.functions as func

In [13]:
class read_info(object):
    def __init__(self, file_path, header=False, delimiter=';', schema=None):
        self.file_path = file_path
        self.header = header
        self.delimiter = delimiter
        self.schema = schema

        self.df = self._load_df()

        #self.df_abstract = self._get_df_abstract()

    def __repr__(self):
        return f"{self._nb_rows()} \n{self.df.printSchema()} \n{self.get_df_abstract()}\n {self.show_missing()}\n{self._get_stats()}"

    def show_missing(self):
        print("Missing Data per column:")
        self._count_missing().show()

    def _get_num_cols(self):
        num_cols = [
            f.name for f in self.df.schema.fields
            if isinstance(f.dataType, DoubleType) or
            isinstance(f.dataType, IntegerType)
        ]
        
        return num_cols
    def _get_rounded_df(self):
        rounded_df = self.df
        dbl_cols = self._get_num_cols()
        for col in dbl_cols:
            rounded_df = rounded_df.withColumn(col, func.round('high'))

        return rounded_df

    def get_df_abstract(self):
        rounded_df = self._get_rounded_df()

        # First 40 rows
        print("First 40 rows:")
        rounded_df.show(40)

        # Last 40 rows
        print("Last 40 rows:")
        rounded_df = rounded_df.withColumn(
            "index", monotonically_increasing_id()
        )
        rounded_df.orderBy(desc("index")).drop("index").show(40)

    def _get_periodicity(self):
        self.df['data'][0]

    def _nb_rows(self):
        # Number of total rows
        print("Number of rows: " + str(self.df.count()) + "\n")

    def _handle_csv(self):
        '''
        @description: Read the csv file and return a Spark DataFrame

        @arg csv_file_path: Path to the csv file
        @arg header: boolean whether to load a header or not
        @arg delimiter: which delimiter to use by default
        '''
        return spark.read.option("inferSchema", "true").option("nullValue", "null").csv(
            self.file_path,
            sep=self.delimiter,
            schema=self.schema,
            header=self.header,
        )
    
    def _handle_json(self):
        return spark.read.json(self.file_path)

    def _load_df(self):
        ####### ADD TRY CATCH #####
        extension = self.file_path.split(".")[-1]

        df = None
        if extension == 'json':
            df = self._handle_json()
        elif extension == 'csv':
            df = self._handle_csv()

        return df

    def _count_missing(self):
        cols = self.df.columns
        cols.remove('Date')
        return self.df.select(
            [
                count(when(isnan(c) | col(c).isNull(), c)).alias(c)
                for c in cols
            ]
        )
        #.show()
        
    def _get_stats(self):
        self.df.summary().show()

In [14]:
print(h)
h.remove('Date')
print(h)

['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close', 'company_name']


ValueError: list.remove(x): x not in list

In [None]:
amzn_schema = StructType([
    StructField('Date', DateType(), True),
    StructField('High', DoubleType(), True),
    StructField('Low', DoubleType(), True),
    StructField('Open', DoubleType(), True),
    StructField('Close', DoubleType(), True),
    StructField('Volume', IntegerType(), True),
    StructField('Adj Close', DoubleType(), True),
    StructField('company_name', StringType(), True)
])

In [15]:
AMZN = read_info('stocks_data/AMAZON.csv', header=True, delimiter=',', schema=amzn_schema)

In [16]:
print(AMZN)

Number of rows: 987

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)

First 40 rows:
+----------+-----+-----+-----+-----+------+---------+------------+
|      Date| High|  Low| Open|Close|Volume|Adj Close|company_name|
+----------+-----+-----+-----+-----+------+---------+------------+
|2017-01-03|759.0|759.0|759.0|759.0| 759.0|    759.0|      AMAZON|
|2017-01-04|760.0|760.0|760.0|760.0| 760.0|    760.0|      AMAZON|
|2017-01-05|782.0|782.0|782.0|782.0| 782.0|    782.0|      AMAZON|
|2017-01-06|799.0|799.0|799.0|799.0| 799.0|    799.0|      AMAZON|
|2017-01-09|802.0|802.0|802.0|802.0| 802.0|    802.0|      AMAZON|
|2017-01-10|798.0|798.0|798.0|798.0| 798.0|    798.0|      AMAZON|
|2017-01-11|800.0|800.0|800.0|800.0| 800.0|    800.0|

In [None]:
##### A FAIRE !!!! UN SCHEMA !!!!!

In [None]:
AMZN.get_df_abstract()

In [None]:
AMZN.show_missing()

In [None]:
AMZN._get_stats()

In [None]:
AMZN.df.withColumn("test", 
              func.datediff(AMZN.df["date"][0], AMZN.df["date"][1])).show()

In [None]:
AMZN.df["Date"].getItem(2)

In [None]:
AMZN.df.first()['Date']

In [None]:
AMZN.df.__get_item(0)

In [None]:
AMZN.df.second()['Date']

In [None]:
func.getrows(AMZN.df, rownums=[0, 2]).collect()

In [None]:
AMZN.df[0].__getitem__("Date").first()

In [None]:
AMZN.df[0]

In [None]:
AMZN.df.select('Date').show()

In [None]:
AMZN.df


In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window



my_window = Window.partitionBy().orderBy("Date")

df = AMZN.df.withColumn("prev_value", F.lag(AMZN.df.Date).over(my_window))
df = df.withColumn("diff", F.when(F.isnull(F.datediff(df.Date, df.prev_value)), 0)
                              .otherwise(F.datediff(df.Date, df.prev_value)))

In [None]:
df.select("diff").show()

In [None]:
from pyspark.sql.functions import mean

In [None]:
df.select(mean('diff')).first()[0]

In [None]:
AMZN.df.stat.corr('High', 'Low')

TODO: Create function to compute per month week year

In [None]:
AMZN.df.select(mean ("Close")).first()[0]

In [None]:
def get_col_mean(df, col):
    return df.select(mean (col)).first()[0]

In [None]:
get_col_mean(AMZN.df, "Close")

In [None]:
AMZN.df.groupBy(func.weekofyear("day").alias("date_by_week")).agg(sum("Date"))

In [None]:
AMZN.df.groupBy(func.weekofyear("day").alias("date_by_week")).agg(sum("Close")).orderBy("date_by_week").show()

In [None]:
AMZN.df.withColumn("Date",func.date_sub(func.next_day(col("Date"),"sunday"),7)).groupBy("Date").agg(sum("Close").cast("int").alias("Close_total")).orderBy("week_strt_day").show()


In [None]:
AMZN.df.groupBy("Date").select("Close").show()

In [None]:
AMZN.df.groupBy(func.month("Date").alias("hour")).agg(mean("Close").alias("close_mean")).show()

In [None]:
AMZN.df.groupBy(func.year("Date").alias("hour")).agg(mean("Close").alias("close_mean")).show()

In [None]:
def 

In [None]:
get_avg(AMZN.df, "Close", func.year)

In [None]:
get_avg(AMZN.df, "Open", func.year)

In [None]:
class Exploration(object):
    def __init__(self, df):
        self.df = df

    def get_oc_avg(self, fun):
        close = self._compute_avg(self.df, "Close", fun)
        opening = self._compute_avg(self.df, "Open", fun)

        return close.join(
            opening, opening.Open_new_time == close.Close_new_time, "inner"
        ).orderBy("Close_new_time").select(
            close.Close_new_time, close.Close_mean, opening.Open_mean
        )

    def _compute_avg(self, df, col, fun):
        return df.groupBy(fun("Date").alias(col + "_new_time")).agg(
            mean(col).alias(col + "_mean")
        )

In [None]:
exAMZN = Exploration(AMZN.df)

In [None]:
exAMZN.get_oc_avg(func.month).show()

In [None]:
exAMZN.get_oc_avg(func.year).show()

In [None]:
def get_price_change(period=None):
    df = AMZN.df
    if period:
        df= exAMZN.get_oc_avg(period)
   
    return  df.withColumn('diff', ( df['Close_mean'] - df['Open_mean'] ))

In [None]:
get_price_change(func.month).show()

In [None]:
get_price_change(func.year).show()