__AUTHORS__:
  - Théo Perinet (22172 - theo.perinet)
  - Mathieu Rivier (23553 - mathieu.rivier)
  - Marc Monteil (23742 - marc.monteil)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#BigData---Final-Project" data-toc-modified-id="BigData---Final-Project-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>BigData - Final Project</a></span><ul class="toc-item"><li><span><a href="#Loading-The-Data" data-toc-modified-id="Loading-The-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Loading The Data</a></span></li><li><span><a href="#Exploring-The-Data" data-toc-modified-id="Exploring-The-Data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Exploring The Data</a></span></li><li><span><a href="#Analysis" data-toc-modified-id="Analysis-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Analysis</a></span></li></ul></li></ul></div>

# BigData - Final Project

###### To Use when you are on google collab
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz  
!tar xf spark-3.2.1-bin-hadoop2.7.tgz
!pip install -q findspark

###### TO USE WHEN YOU ARE ON GOOGLE COLLAB
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"
import findspark
findspark.init()

from google.colab import drive
drive.mount('/content/drive')

## Loading The Data

In [54]:
from pyspark.sql import SparkSession

In [55]:
spark_application_name = "WannaFlop_Project"

In [56]:
spark = (SparkSession.builder.appName(spark_application_name).getOrCreate())

## Exploring The Data

In [57]:
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, StructType,StructField
from pyspark.sql.functions import desc
import pyspark.sql.functions as func

In [58]:
class read_info(object):
    def __init__(self, file_path, header=False, delimiter=';', schema=None):
        self.file_path = file_path
        self.header = header
        self.delimiter = delimiter
        self.schema = schema

        self.df = self._load_df()

        #self.df_abstract = self._get_df_abstract()

    def __repr__(self):
        return f"{self._nb_rows()} \n{self.df.printSchema()} \n{self.get_df_abstract()}\n {self.show_missing()}\n{self._get_stats()}"

    def show_missing(self):
        print("Missing Data per column:")
        self._count_missing().show()

    def _get_num_cols(self):
        num_cols = [
            f.name for f in self.df.schema.fields
            if isinstance(f.dataType, DoubleType) or
            isinstance(f.dataType, IntegerType)
        ]
        
        return num_cols
    def _get_rounded_df(self):
        rounded_df = self.df
        dbl_cols = self._get_num_cols()
        for col in dbl_cols:
            rounded_df = rounded_df.withColumn(col, func.round('high'))

        return rounded_df

    def get_df_abstract(self):
        rounded_df = self._get_rounded_df()

        # First 40 rows
        print("First 40 rows:")
        rounded_df.show(40)

        # Last 40 rows
        print("Last 40 rows:")
        rounded_df = rounded_df.withColumn(
            "index", monotonically_increasing_id()
        )
        rounded_df.orderBy(desc("index")).drop("index").show(40)

    def _get_periodicity(self):
        self.df['data'][0]

    def _nb_rows(self):
        # Number of total rows
        print("Number of rows: " + str(self.df.count()) + "\n")

    def _handle_csv(self):
        '''
        @description: Read the csv file and return a Spark DataFrame

        @arg csv_file_path: Path to the csv file
        @arg header: boolean whether to load a header or not
        @arg delimiter: which delimiter to use by default
        '''
        return spark.read.option("inferSchema", "true").option("nullValue", "null").csv(
            self.file_path,
            sep=self.delimiter,
            schema=self.schema,
            header=self.header,
        )
    
    def _handle_json(self):
        return spark.read.json(self.file_path)

    def _load_df(self):
        ####### ADD TRY CATCH #####
        extension = self.file_path.split(".")[-1]

        df = None
        if extension == 'json':
            df = self._handle_json()
        elif extension == 'csv':
            df = self._handle_csv()

        return df

    def _count_missing(self):
        cols = self.df.columns
        cols.remove('Date')
        return self.df.select(
            [
                count(when(isnan(c) | col(c).isNull(), c)).alias(c)
                for c in cols
            ]
        )
        #.show()
        
    def _get_stats(self):
        self.df.summary().show()

In [59]:
print(h)
h.remove('Date')
print(h)

NameError: name 'h' is not defined

In [60]:
amzn_schema = StructType([
    StructField('Date', DateType(), True),
    StructField('High', DoubleType(), True),
    StructField('Low', DoubleType(), True),
    StructField('Open', DoubleType(), True),
    StructField('Close', DoubleType(), True),
    StructField('Volume', IntegerType(), True),
    StructField('Adj Close', DoubleType(), True),
    StructField('company_name', StringType(), True)
])

In [61]:
AMZN = read_info('stocks_data/AMAZON.csv', header=True, delimiter=',', schema=amzn_schema)

In [62]:
print(AMZN)

Number of rows: 987

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)

First 40 rows:
+----------+-----+-----+-----+-----+------+---------+------------+
|      Date| High|  Low| Open|Close|Volume|Adj Close|company_name|
+----------+-----+-----+-----+-----+------+---------+------------+
|2017-01-03|759.0|759.0|759.0|759.0| 759.0|    759.0|      AMAZON|
|2017-01-04|760.0|760.0|760.0|760.0| 760.0|    760.0|      AMAZON|
|2017-01-05|782.0|782.0|782.0|782.0| 782.0|    782.0|      AMAZON|
|2017-01-06|799.0|799.0|799.0|799.0| 799.0|    799.0|      AMAZON|
|2017-01-09|802.0|802.0|802.0|802.0| 802.0|    802.0|      AMAZON|
|2017-01-10|798.0|798.0|798.0|798.0| 798.0|    798.0|      AMAZON|
|2017-01-11|800.0|800.0|800.0|800.0| 800.0|    800.0|

In [63]:
##### A FAIRE !!!! UN SCHEMA !!!!!

In [64]:
AMZN.get_df_abstract()

First 40 rows:
+----------+-----+-----+-----+-----+------+---------+------------+
|      Date| High|  Low| Open|Close|Volume|Adj Close|company_name|
+----------+-----+-----+-----+-----+------+---------+------------+
|2017-01-03|759.0|759.0|759.0|759.0| 759.0|    759.0|      AMAZON|
|2017-01-04|760.0|760.0|760.0|760.0| 760.0|    760.0|      AMAZON|
|2017-01-05|782.0|782.0|782.0|782.0| 782.0|    782.0|      AMAZON|
|2017-01-06|799.0|799.0|799.0|799.0| 799.0|    799.0|      AMAZON|
|2017-01-09|802.0|802.0|802.0|802.0| 802.0|    802.0|      AMAZON|
|2017-01-10|798.0|798.0|798.0|798.0| 798.0|    798.0|      AMAZON|
|2017-01-11|800.0|800.0|800.0|800.0| 800.0|    800.0|      AMAZON|
|2017-01-12|814.0|814.0|814.0|814.0| 814.0|    814.0|      AMAZON|
|2017-01-13|822.0|822.0|822.0|822.0| 822.0|    822.0|      AMAZON|
|2017-01-17|816.0|816.0|816.0|816.0| 816.0|    816.0|      AMAZON|
|2017-01-18|812.0|812.0|812.0|812.0| 812.0|    812.0|      AMAZON|
|2017-01-19|814.0|814.0|814.0|814.0| 814.0|    

In [65]:
AMZN.show_missing()

Missing Data per column:
+----+---+----+-----+------+---------+------------+
|High|Low|Open|Close|Volume|Adj Close|company_name|
+----+---+----+-----+------+---------+------------+
|   0|  0|   0|    0|     0|        0|           0|
+----+---+----+-----+------+---------+------------+



In [66]:
AMZN._get_stats()

+-------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------+
|summary|              High|               Low|             Open|             Close|           Volume|         Adj Close|company_name|
+-------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------+
|  count|               987|               987|              987|               987|              987|               987|         987|
|   mean|1762.0071216958152|1722.1011452099956|1743.433881363487|1742.9566644206718| 4509728.05775076|1742.9566644206718|        null|
| stddev| 667.2385315752688| 644.7988093382758|657.1153070927137| 655.9576061129322|2179817.628631287| 655.9576061129322|        null|
|    min|  758.760009765625| 747.7000122070312|757.9199829101562| 753.6699829101562|           881300| 753.6699829101562|      AMAZON|
|    25%|            1191.0|            1176.0|1188.300

In [87]:
AMZN.df.withColumn("test", 
              func.datediff(AMZN.df["date"][0], AMZN.df["date"][1])).show()

AnalysisException: Can't extract value from date#7653: need struct type but got date

In [88]:
AMZN.df["Date"].getItem(2)

Column<'Date[2]'>

In [89]:
AMZN.df.first()['Date']

datetime.date(2017, 1, 3)

In [90]:
AMZN.df.__get_item(0)

AttributeError: 'DataFrame' object has no attribute '__get_item'

In [91]:
AMZN.df.second()['Date']

AttributeError: 'DataFrame' object has no attribute 'second'

In [92]:
func.getrows(AMZN.df, rownums=[0, 2]).collect()

AttributeError: module 'pyspark.sql.functions' has no attribute 'getrows'

In [93]:
AMZN.df[0].__getitem__("Date").first()

TypeError: 'Column' object is not callable

In [94]:
AMZN.df[0]

Column<'Date'>

In [95]:
AMZN.df.select('Date').show()

+----------+
|      Date|
+----------+
|2017-01-03|
|2017-01-04|
|2017-01-05|
|2017-01-06|
|2017-01-09|
|2017-01-10|
|2017-01-11|
|2017-01-12|
|2017-01-13|
|2017-01-17|
|2017-01-18|
|2017-01-19|
|2017-01-20|
|2017-01-23|
|2017-01-24|
|2017-01-25|
|2017-01-26|
|2017-01-27|
|2017-01-30|
|2017-01-31|
+----------+
only showing top 20 rows



In [96]:
AMZN.df


DataFrame[Date: date, High: double, Low: double, Open: double, Close: double, Volume: int, Adj Close: double, company_name: string]

In [97]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window



my_window = Window.partitionBy().orderBy("Date")

df = AMZN.df.withColumn("prev_value", F.lag(AMZN.df.Date).over(my_window))
df = df.withColumn("diff", F.when(F.isnull(F.datediff(df.Date, df.prev_value)), 0)
                              .otherwise(F.datediff(df.Date, df.prev_value)))

In [98]:
df.select("diff").show()

+----+
|diff|
+----+
|   0|
|   1|
|   1|
|   1|
|   3|
|   1|
|   1|
|   1|
|   1|
|   4|
|   1|
|   1|
|   1|
|   3|
|   1|
|   1|
|   1|
|   1|
|   3|
|   1|
+----+
only showing top 20 rows



22/05/21 21:26:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/21 21:26:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/21 21:26:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [99]:
from pyspark.sql.functions import mean

In [100]:
df.select(mean('diff')).first()[0]

22/05/21 21:26:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/21 21:26:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


1.447821681864235

In [101]:
AMZN.df.stat.corr('High', 'Low')

0.999196080434689

TODO: Create function to compute per month week year

In [82]:
AMZN.df.select(mean ("Close")).first()[0]

1742.9566644206718

In [83]:
def get_col_mean(df, col):
    return df.select(mean (col)).first()[0]

In [84]:
get_col_mean(AMZN.df, "Close")

1742.9566644206718

In [85]:
AMZN.df.groupBy(func.weekofyear("day").alias("date_by_week")).agg(sum("Date"))

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [86]:
AMZN.df.groupBy(func.weekofyear("day").alias("date_by_week")).agg(sum("Close")).orderBy("date_by_week").show()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
AMZN.df.withColumn("Date",func.date_sub(func.next_day(col("Date"),"sunday"),7)).groupBy("Date").agg(sum("Close").cast("int").alias("Close_total")).orderBy("week_strt_day").show()


In [None]:
AMZN.df.groupBy("Date").select("Close").show()

In [None]:
AMZN.df.groupBy(func.month("Date").alias("hour")).agg(mean("Close").alias("close_mean")).show()

In [None]:
AMZN.df.groupBy(func.year("Date").alias("hour")).agg(mean("Close").alias("close_mean")).show()

In [None]:
get_avg(AMZN.df, "Close", func.year)

In [None]:
get_avg(AMZN.df, "Open", func.year)

In [None]:
class Exploration(object):
    def __init__(self, df):
        self.df = df

    def get_oc_avg(self, fun):
        close = self._compute_avg(self.df, "Close", fun)
        opening = self._compute_avg(self.df, "Open", fun)

        return close.join(
            opening, opening.Open_new_time == close.Close_new_time, "inner"
        ).orderBy("Close_new_time").select(
            close.Close_new_time, close.Close_mean, opening.Open_mean
        )

    def _compute_avg(self, df, col, fun):
        return df.groupBy(fun("Date").alias(col + "_new_time")).agg(
            mean(col).alias(col + "_mean")
        )

In [None]:
exAMZN = Exploration(AMZN.df)

In [None]:
exAMZN.get_oc_avg(func.month).show()

In [None]:
exAMZN.get_oc_avg(func.year).show()

In [None]:
def get_price_change(period=None):
    df = AMZN.df
    if period:
        df= exAMZN.get_oc_avg(period)
   
    return  df.withColumn('diff', ( df['Close_mean'] - df['Open_mean'] ))

In [None]:
get_price_change(func.month).show()

In [None]:
get_price_change(func.year).show()

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
from stock import Stock

In [23]:
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, StructType,StructField

In [24]:
amzn_schema = StructType([
    StructField('Date', DateType(), True),
    StructField('High', DoubleType(), True),
    StructField('Low', DoubleType(), True),
    StructField('Open', DoubleType(), True),
    StructField('Close', DoubleType(), True),
    StructField('Volume', DoubleType(), True),
    StructField('Adj Close', DoubleType(), True),
    StructField('company_name', StringType(), True)
])

In [51]:
AMZN = Stock('stocks_data/AMAZON.csv', header=True, delimiter=',', schema=amzn_schema)

In [52]:
AMZN.explore

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)

First 40 rows:
+----------+-----+-----+-----+-----+---------+---------+------------+
|      Date| High|  Low| Open|Close|   Volume|Adj Close|company_name|
+----------+-----+-----+-----+-----+---------+---------+------------+
|2017-01-03|759.0|748.0|758.0|754.0|3521100.0|    754.0|      AMAZON|
|2017-01-04|760.0|754.0|758.0|757.0|2510500.0|    757.0|      AMAZON|
|2017-01-05|782.0|760.0|762.0|780.0|5830100.0|    780.0|      AMAZON|
|2017-01-06|799.0|778.0|782.0|796.0|5986200.0|    796.0|      AMAZON|
|2017-01-09|802.0|792.0|798.0|797.0|3446100.0|    797.0|      AMAZON|
|2017-01-10|798.0|790.0|797.0|796.0|2558400.0|    796.0|      AMAZON|
|2017-01-11|800.0|790.0|794.0|799.0|2992800.0|  


            None

            None

            None

            None

            None

In [53]:
AMZN.explore._nb_rows()

Number of rows: 987



In [8]:
AMZN.explore._print_schema()

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- company_name: string (nullable = true)



In [10]:
AMZN.explore.get_df_abstract()

First 40 rows:
+----------+----+----+----+-----+---------+---------+------------+
|      Date|High| Low|Open|Close|   Volume|Adj Close|company_name|
+----------+----+----+----+-----+---------+---------+------------+
|2017-01-03|63.0|62.0|63.0| 63.0|2.06941E7|     59.0|   MICROSOFT|
|2017-01-04|63.0|62.0|62.0| 62.0|  2.134E7|     58.0|   MICROSOFT|
|2017-01-05|63.0|62.0|62.0| 62.0| 2.4876E7|     58.0|   MICROSOFT|
|2017-01-06|63.0|62.0|62.0| 63.0|1.99229E7|     59.0|   MICROSOFT|
|2017-01-09|63.0|63.0|63.0| 63.0|2.03827E7|     59.0|   MICROSOFT|
|2017-01-10|63.0|62.0|63.0| 63.0| 1.8593E7|     59.0|   MICROSOFT|
|2017-01-11|63.0|62.0|63.0| 63.0|2.15173E7|     59.0|   MICROSOFT|
|2017-01-12|63.0|62.0|63.0| 63.0|2.09682E7|     59.0|   MICROSOFT|
|2017-01-13|63.0|62.0|63.0| 63.0|1.94223E7|     59.0|   MICROSOFT|
|2017-01-17|63.0|62.0|63.0| 63.0| 2.0664E7|     59.0|   MICROSOFT|
|2017-01-18|63.0|62.0|63.0| 63.0|1.96701E7|     59.0|   MICROSOFT|
|2017-01-19|63.0|62.0|62.0| 62.0|1.84517E7|    

In [11]:
AMZN.explore.get_missing()

Missing Data per column:
+----+---+----+-----+------+---------+------------+
|High|Low|Open|Close|Volume|Adj Close|company_name|
+----+---+----+-----+------+---------+------------+
|   0|  0|   0|    0|     0|        0|           0|
+----+---+----+-----+------+---------+------------+



In [48]:
AMZN.explore.get_stats()

Stock Stats:
+-------+------+------+------+------+---------+---------+------------+
|summary|  High|   Low|  Open| Close|   Volume|Adj Close|company_name|
+-------+------+------+------+------+---------+---------+------------+
|  count| 987.0| 987.0| 987.0| 987.0|    987.0|      0.0|         987|
|   mean|1762.0|1722.0|1743.0|1743.0|4509728.0|     null|        null|
| stddev| 667.0| 645.0| 657.0| 656.0|2179818.0|     null|        null|
|    min| 759.0| 748.0| 758.0| 754.0| 881300.0|     null|      AMAZON|
|    25%|1191.0|1176.0|1188.0|1186.0|2982700.0|     null|        null|
|    50%|1757.0|1719.0|1742.0|1740.0|3925600.0|     null|        null|
|    75%|1942.0|1900.0|1923.0|1918.0|5429100.0|     null|        null|
|    max|3552.0|3487.0|3547.0|3531.0| 1.6565E7|     null|      AMAZON|
+-------+------+------+------+------+---------+---------+------------+



## Analysis

In [13]:
AMZN.analysis.get_oc_avg("day").show()

+----------+------------------+------------------+
|      Date|        Close_mean|         Open_mean|
+----------+------------------+------------------+
|2017-01-03| 62.58000183105469|62.790000915527344|
|2017-01-04| 62.29999923706055| 62.47999954223633|
|2017-01-05| 62.29999923706055|62.189998626708984|
|2017-01-06| 62.84000015258789| 62.29999923706055|
|2017-01-09| 62.63999938964844|  62.7599983215332|
|2017-01-10|62.619998931884766| 62.72999954223633|
|2017-01-11|63.189998626708984| 62.61000061035156|
|2017-01-12| 62.61000061035156|63.060001373291016|
|2017-01-13| 62.70000076293945|62.619998931884766|
|2017-01-17|62.529998779296875| 62.68000030517578|
|2017-01-18|              62.5| 62.66999816894531|
|2017-01-19| 62.29999923706055|  62.2400016784668|
|2017-01-20|  62.7400016784668| 62.66999816894531|
|2017-01-23|62.959999084472656| 62.70000076293945|
|2017-01-24| 63.52000045776367| 63.20000076293945|
|2017-01-25| 63.68000030517578| 63.95000076293945|
|2017-01-26|  64.2699966430664|

In [14]:
AMZN.analysis.get_oc_avg("year").show()

+----+------------------+------------------+
|Date|        Close_mean|         Open_mean|
+----+------------------+------------------+
|2017| 71.98402421502954| 71.95430287516925|
|2018|101.03398411967365|101.12235092831799|
|2019|130.38202400813026|130.33904787093874|
|2020| 190.8616180419922|190.76480678836674|
+----+------------------+------------------+



In [15]:
AMZN.analysis.get_price_change("month").show()

+-------+------------------+------------------+--------------------+
|   Date|        Close_mean|         Open_mean|                diff|
+-------+------------------+------------------+--------------------+
|2017-01| 63.19199962615967|63.185500144958496|0.006499481201174717|
|2017-02| 64.11368440326892| 64.13473711515728|  -0.021052711888359|
|2017-03| 64.84130494490914| 64.76434906669284| 0.07695587821629601|
|2017-04| 66.17157946134868| 66.23894781815379|-0.06736835680510467|
|2017-05| 68.91727308793502| 68.82818222045898| 0.08909086747603112|
|2017-06|  70.5181815407493| 70.56181820956144|-0.04363666881214101|
|2017-07| 72.01050033569337| 71.84349975585937| 0.16700057983399574|
|2017-08| 72.81695755668308|  72.7156518023947| 0.10130575428837574|
|2017-09| 74.34450073242188|  74.3654998779297|-0.02099914550781...|
|2017-10| 77.93954571810636| 77.89318119395863| 0.04636452414773373|
|2017-11| 83.71761903308686| 83.64523824055989| 0.07238079252697105|
|2017-12|   84.758500289917| 84.83

In [16]:
AMZN.analysis.get_price_change("year").show()

+----+------------------+------------------+--------------------+
|Date|        Close_mean|         Open_mean|                diff|
+----+------------------+------------------+--------------------+
|2017| 71.98402421502954| 71.95430287516925|  0.0297213398602878|
|2018|101.03398411967365|101.12235092831799|-0.08836680864433788|
|2019|130.38202400813026|130.33904787093874| 0.04297613719151627|
|2020| 190.8616180419922|190.76480678836674| 0.09681125362544662|
+----+------------------+------------------+--------------------+



In [18]:
AMZN.analysis._compute_avg(AMZN.df, "Close", "month").show()

+-------+------------------+
|   Date|        Close_mean|
+-------+------------------+
|2017-01| 63.19199962615967|
|2017-02| 64.11368440326892|
|2017-03| 64.84130494490914|
|2017-04| 66.17157946134868|
|2017-05| 68.91727308793502|
|2017-06|  70.5181815407493|
|2017-07| 72.01050033569337|
|2017-08| 72.81695755668308|
|2017-09| 74.34450073242188|
|2017-10| 77.93954571810636|
|2017-11| 83.71761903308686|
|2017-12|   84.758500289917|
|2018-01| 90.07523781912667|
|2018-02| 91.36789462440892|
|2018-03| 92.89904748825799|
|2018-04| 93.21476164318267|
|2018-05| 96.98136381669478|
|2018-06|100.56190454392205|
|2018-07| 104.6385730561756|
|2018-08|108.68434740149456|
+-------+------------------+
only showing top 20 rows



In [25]:
AMZN.analysis.get_daily_return(period="month").show()

+-------+------------------+------------------+--------------------+
|   Date|        Close_mean|         Open_mean|                diff|
+-------+------------------+------------------+--------------------+
|2017-01| 63.19199962615967|63.185500144958496|0.006499481201174717|
|2017-02| 64.11368440326892| 64.13473711515728|  -0.021052711888359|
|2017-03| 64.84130494490914| 64.76434906669284| 0.07695587821629601|
|2017-04| 66.17157946134868| 66.23894781815379|-0.06736835680510467|
|2017-05| 68.91727308793502| 68.82818222045898| 0.08909086747603112|
|2017-06|  70.5181815407493| 70.56181820956144|-0.04363666881214101|
|2017-07| 72.01050033569337| 71.84349975585937| 0.16700057983399574|
|2017-08| 72.81695755668308|  72.7156518023947| 0.10130575428837574|
|2017-09| 74.34450073242188|  74.3654998779297|-0.02099914550781...|
|2017-10| 77.93954571810636| 77.89318119395863| 0.04636452414773373|
|2017-11| 83.71761903308686| 83.64523824055989| 0.07238079252697105|
|2017-12|   84.758500289917| 84.83

In [26]:
AMZN.analysis.get_daily_return(period="day").show()

+----------+------------------+------------------+--------------------+
|      Date|        Close_mean|         Open_mean|                diff|
+----------+------------------+------------------+--------------------+
|2017-01-03| 62.58000183105469|62.790000915527344|-0.20999908447265625|
|2017-01-04| 62.29999923706055| 62.47999954223633|-0.18000030517578125|
|2017-01-05| 62.29999923706055|62.189998626708984|  0.1100006103515625|
|2017-01-06| 62.84000015258789| 62.29999923706055|  0.5400009155273438|
|2017-01-09| 62.63999938964844|  62.7599983215332|-0.11999893188476562|
|2017-01-10|62.619998931884766| 62.72999954223633| -0.1100006103515625|
|2017-01-11|63.189998626708984| 62.61000061035156|  0.5799980163574219|
|2017-01-12| 62.61000061035156|63.060001373291016| -0.4500007629394531|
|2017-01-13| 62.70000076293945|62.619998931884766|  0.0800018310546875|
|2017-01-17|62.529998779296875| 62.68000030517578|-0.15000152587890625|
|2017-01-18|              62.5| 62.66999816894531| -0.1699981689

In [23]:
AMZN.analysis.get_daily_return_rate(period="month", start_price=15.4, nb_shares=100).show()

+-------+------------------+------------------+--------------------+-------------------+
|   Date|        Close_mean|         Open_mean|                diff|            daily_r|
+-------+------------------+------------------+--------------------+-------------------+
|2017-01| 63.19199962615967|63.185500144958496|0.006499481201174717|  4.220442338425141|
|2017-02| 64.11368440326892| 64.13473711515728|  -0.021052711888359|-13.670592135298051|
|2017-03| 64.84130494490914| 64.76434906669284| 0.07695587821629601|  49.97134949110131|
|2017-04| 66.17157946134868| 66.23894781815379|-0.06736835680510467|-43.745686237080946|
|2017-05| 68.91727308793502| 68.82818222045898| 0.08909086747603112| 57.851212646773455|
|2017-06|  70.5181815407493| 70.56181820956144|-0.04363666881214101|-28.335499228662993|
|2017-07| 72.01050033569337| 71.84349975585937| 0.16700057983399574| 108.44193495714009|
|2017-08| 72.81695755668308|  72.7156518023947| 0.10130575428837574|  65.78295733011412|
|2017-09| 74.34450073

In [27]:
AMZN.analysis.get_daily_return_rate(period="year", start_price=15.4, nb_shares=100).show()

+----+------------------+------------------+--------------------+------------------+
|Date|        Close_mean|         Open_mean|                diff|           daily_r|
+----+------------------+------------------+--------------------+------------------+
|2017| 71.98402421502954| 71.95430287516925|  0.0297213398602878| 19.29957133784922|
|2018|101.03398411967365|101.12235092831799|-0.08836680864433788|-57.38104457424538|
|2019|130.38202400813026|130.33904787093874| 0.04297613719151627| 27.90658259189368|
|2020| 190.8616180419922|190.76480678836674| 0.09681125362544662|62.864450406134175|
+----+------------------+------------------+--------------------+------------------+



In [None]:
import glob

glob.glob("stocks_data/*.csv")

In [39]:
from stocks import Stocks

In [40]:
stocks = Stocks(schema=amzn_schema)

In [43]:
stocks.get_max_daily_return()

{'AMAZON': 196.64013671875,
 'GOOGLE': 61.8299560546875,
 'TESLA': 53.71002197265625,
 'ZOOM': 34.470001220703125,
 'FACEBOOK': 19.910003662109375,
 'MICROSOFT': 11.330001831054688,
 'APPLE': 5.540000915527344}

In [102]:
AMZN.df.select('Date').show()

+----------+
|      Date|
+----------+
|2017-01-03|
|2017-01-04|
|2017-01-05|
|2017-01-06|
|2017-01-09|
|2017-01-10|
|2017-01-11|
|2017-01-12|
|2017-01-13|
|2017-01-17|
|2017-01-18|
|2017-01-19|
|2017-01-20|
|2017-01-23|
|2017-01-24|
|2017-01-25|
|2017-01-26|
|2017-01-27|
|2017-01-30|
|2017-01-31|
+----------+
only showing top 20 rows

