## Developing a data analysis routine

### Exploratory Data Analysis (EDA)

In [2]:
import numpy as np 
import pandas as pd

# Load the college dataset from its CSV file; a random row
# is inspected with .sample in the next cell.
college_csv_path = "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college.csv"
college = pd.read_csv(college_csv_path)

In [3]:
# Draw a single row; the fixed seed makes the pick repeatable.
college.sample(n=1, random_state=42)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3649,Career Point College,San Antonio,TX,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,0.0,1,0.9172,0.9172,0.697,20700,14977


In [4]:
# Get the dimensions of the DataFrame with the 
# .shape attribute:
college.shape

(7535, 27)

In [7]:
# List the data type of each column, the number of
# non-missing values, and memory usage with the
# .info method:
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [12]:
# Summarize the numeric columns only. Transposing puts one
# dataset column on each row, which is easier to read:
college.describe(include=[np.number]).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [15]:
# Get summary statistics for the object (string)
# columns. The builtin `object` is passed instead of the
# `np.object` alias, which was deprecated in NumPy 1.20:
college.describe(include=[object]).T

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  college.describe(include=[np.object]).T


Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama A & M University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [18]:
# .describe accepts a custom list of quantiles for the
# numeric columns through its percentiles argument:
quantile_cuts = [0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
college.describe(percentiles=quantile_cuts, include=[np.number]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,390.0,430.0,447.4,475.0,510.0,555.0,605.0,665.0,730.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,395.0,430.0,453.0,482.0,520.0,565.0,630.0,685.0,745.25,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,14.0,31.65,49.0,117.0,412.5,1929.5,6512.3,11858.05,26015.29,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.0,0.013265,0.06879,0.2675,0.5557,0.747875,0.86297,0.927315,1.0,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.0,0.0,0.00753,0.036125,0.10005,0.2577,0.51571,0.726715,0.961467,1.0


## Data dictionaries

In [19]:
# The accompanying college_data_dictionary.csv file
# describes each column of the college dataset:
data_dictionary_path = "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college_data_dictionary.csv"
pd.read_csv(data_dictionary_path)

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
4,MENONLY,0/1 Men Only
5,WOMENONLY,0/1 Women only
6,RELAFFIL,0/1 Religious Affiliation
7,SATVRMID,SAT Verbal Median
8,SATMTMID,SAT Math Median
9,DISTANCEONLY,Distance Education Only


### Reducing memory by changing data types

In [20]:
# Re-read the college dataset from disk.
college = pd.read_csv(
    "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college.csv"
)

In [21]:
# Pick a handful of columns and pull them out with .loc
# into the smaller frame `col2`.
different_cols = ["RELAFFIL", "SATMTMID", "CURROPER", "INSTNM", "STABBR"]
col2 = college.loc[:, different_cols]

In [23]:
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [24]:
# inspect the data types of each column:
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [25]:
# Record the per-column memory footprint (in bytes) before any
# dtype changes; deep=True also measures the contents of
# object columns.
original_num = col2.memory_usage(index=True, deep=True)

In [26]:
original_num

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [27]:
# RELAFFIL only ever holds 0 or 1, so 64 bits per value is
# wasteful. Cast it to an 8-bit (1 byte) integer with
# .astype:
col2["RELAFFIL"] = col2["RELAFFIL"].astype("int8")

In [29]:
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [30]:
# Measure each column again; RELAFFIL should now be far
# smaller than before the cast:
col2.memory_usage(index=True, deep=True)

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [33]:
# Object columns with a reasonably low cardinality (few
# distinct values) are good candidates for the categorical
# dtype, which saves even more memory. Count the unique
# values in each object column first:
col2.select_dtypes(include="object").nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [35]:
# Fewer than one percent of the STABBR values are unique,
# which makes the column a good fit for the categorical dtype:
col2["STABBR"] = col2["STABBR"].astype("category")

In [36]:
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [37]:
# Measure the per-column memory footprint once more, now that
# both dtype conversions have been applied.
new_mem = col2.memory_usage(index=True, deep=True)

In [38]:
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13120
dtype: int64

In [39]:
# Divide the updated footprint by the original one, column by
# column. RELAFFIL comes out at one eighth of its original
# size, as expected, and STABBR at roughly three percent:
new_mem.div(original_num)

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000695
STABBR      0.029512
dtype: float64

In [51]:
# Overwrite CURROPER in the row labelled 0 with a much larger
# integer; the effect on memory is measured a few cells below.
college.loc[0, "CURROPER"] = 10_000_000

In [52]:
# Append a single character to the institution name in the row
# labelled 0. Note: re-running this cell appends another "a".
college.loc[0, "INSTNM"] += "a"

In [53]:
# Re-measure just the two columns that were modified above.
college.loc[:, ["CURROPER", "INSTNM"]].memory_usage(deep=True)

Index          128
CURROPER     60280
INSTNM      660700
dtype: int64

In [54]:
college["MENONLY"].dtype

dtype('float64')

In [59]:
# MENONLY contains missing values, so it cannot be cast to a
# plain NumPy integer dtype. The failure is the point of this
# cell, so catch it and show the message instead of letting it
# abort a Restart & Run All. (The nullable "Int8" extension
# dtype would accept the missing values.)
try:
    college["MENONLY"].astype(np.int8)
except ValueError as err:  # pandas' IntCastingNaNError subclasses ValueError
    print(f"{type(err).__name__}: {err}")

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [61]:
# dtype names given as strings work with .astype too; a
# column -> dtype mapping converts both columns in one call
# and returns a new DataFrame:
college.astype({"MENONLY": "float16", "RELAFFIL": "int8"})

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M Universitya,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0000,0.0059,0.0138,0.0656,10000000,0.7356,0.8284,0.1049,30300,33888
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.0100,0.2607,1,0.3460,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0000,0.0000,0.2715,0.4536,1,0.6801,0.7795,0.8540,40100,23370
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.0350,0.2146,1,0.3072,0.4596,0.2640,45500,24097
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.1270,26600,33118.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7530,SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,,,...,,,,,1,,,,,9500
7531,Rasmussen College - Overland Park,Overland Park,KS,,,,1,,,,...,,,,,1,,,,,21163
7532,National Personal Training Institute of Cleveland,Highland Heights,OH,,,,1,,,,...,,,,,1,,,,,6333
7533,Bay Area Medical Academy - San Jose Satellite ...,San Jose,CA,,,,1,,,,...,,,,,1,,,,,PrivacySuppressed


In [62]:
# Lastly, compare the memory footprint of the minimal
# RangeIndex with that of an integer index that stores every
# row label in memory.
# pd.Int64Index was deprecated in pandas 1.4 and removed in
# 2.0, so the labels are materialized as an int64 array and
# wrapped in a plain pd.Index. Passing the RangeIndex itself
# to pd.Index would just hand back a RangeIndex.
college.index = pd.Index(college.index.to_numpy(dtype="int64"))

In [63]:
college.index.memory_usage()  # the RangeIndex needed only 128 bytes (see the "Index" rows of the earlier memory_usage outputs)

60280

### Selecting the smallest of the largest

In [65]:
# Load the movie dataset; the next cell keeps only the
# movie_title, imdb_score, and budget columns.
movie_csv_path = "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/movie.csv"
movie = pd.read_csv(movie_csv_path)

In [66]:
movie2 = movie[["movie_title", "imdb_score", "budget"]]

In [67]:
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [68]:
# Take the 100 highest-scoring movies with .nlargest and
# preview the first five of them:
movie2.nlargest(n=100, columns="imdb_score").head(5)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [69]:
# Of those 100 top-scoring films, pick the five with the
# smallest budget by chaining .nsmallest:
movie2.nlargest(n=100, columns="imdb_score").nsmallest(n=5, columns="budget")

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


### Selecting the largest of each group by sorting

In [70]:
# Re-read the movie dataset. The following cells work with
# just three of its columns: movie_title, title_year, and
# imdb_score.
movie_csv_path = "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/movie.csv"
movie = pd.read_csv(movie_csv_path)

In [71]:
movie[["movie_title", "title_year", "imdb_score"]]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's End,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens,,7.1
...,...,...,...
4911,Signed Sealed Delivered,2013.0,7.7
4912,The Following,,7.5
4913,A Plague So Pleasant,2013.0,6.3
4914,Shanghai Calling,2012.0,6.3


In [72]:
# Sort the three columns by title_year with .sort_values.
# Sorting is ascending (smallest first) by default; pass
# ascending=False to invert that order:
movie[["movie_title", "title_year", "imdb_score"]].sort_values(by="title_year")

Unnamed: 0,movie_title,title_year,imdb_score
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0
4833,Over the Hill to the Poorhouse,1920.0,4.8
4767,The Big Parade,1925.0,8.3
2694,Metropolis,1927.0,8.3
4697,The Broadway Melody,1929.0,6.3
...,...,...,...
4683,Heroes,,7.7
4688,Home Movies,,8.2
4704,Revolution,,6.7
4752,Happy Valley,,8.5


In [75]:
# Sort on two keys at once: latest title_year first and, within
# each year, highest imdb_score first.
movie[["movie_title", "title_year", "imdb_score"]].sort_values(
    by=["title_year", "imdb_score"], ascending=False
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2
...,...,...,...
1391,Rush Hour,,5.8
4031,Creature,,5.0
2165,Meet the Browns,,3.5
3246,The Bold and the Beautiful,,3.5


In [78]:
# After the descending two-key sort, the first row of each year
# is that year's highest-scoring movie, so .drop_duplicates on
# title_year (which keeps the first occurrence) leaves one row
# per year:
movie[["movie_title", "title_year", "imdb_score"]].sort_values(
    by=["title_year", "imdb_score"], ascending=False
).drop_duplicates(subset="title_year")


Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0


In [82]:
# As in most things pandas, there is more than one
# way to do this. If you find yourself comfortable
# with grouping operations, you can use the .groupby
# method to do this as well:

(
    movie[["movie_title", "title_year", "imdb_score"]]
    # one group per title_year
    .groupby("title_year", as_index=False)
    # within each group, keep only the row with the highest imdb_score
    .apply(lambda df: df.sort_values("imdb_score", ascending=False)
           .head(1)
          )
# NOTE(review): droplevel(0) presumably strips the outer group level
# that .apply adds to the index, leaving the original row labels —
# confirm on the pandas version in use. The final sort orders the
# per-year winners from the latest year to the earliest.
).droplevel(0).sort_values(["title_year", "imdb_score"], ascending=False)


Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
4555,Pandora's Box,1929.0,8.0
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8


In [90]:
# For every (title_year, content_rating) pair keep the first row
# after sorting all three keys in descending order, i.e. the row
# with the largest budget in that pair.
# By default, .drop_duplicates keeps the very first appearance
# of a value, but this behavior may be modified by passing
# keep='last'.
rating_budget_cols = ["movie_title", "title_year", "content_rating", "budget"]
(
    movie[rating_budget_cols]
    .sort_values(by=["title_year", "content_rating", "budget"], ascending=False)
    .drop_duplicates(subset=["title_year", "content_rating"])
)

Unnamed: 0,movie_title,title_year,content_rating,budget
754,London Has Fallen,2016.0,R,60000000.0
10,Batman v Superman: Dawn of Justice,2016.0,PG-13,250000000.0
79,The Jungle Book,2016.0,PG,175000000.0
3252,The Wailing,2016.0,Not Rated,
801,Xi you ji zhi: Sun Wukong san da Baigu Jing,2016.0,,68005000.0
...,...,...,...,...
2558,Lilyhammer,,TV-MA,34000000.0
807,"Sabrina, the Teenage Witch",,TV-G,3000000.0
3424,Buffy the Vampire Slayer,,TV-14,2300000.0
2436,Carlos,,Not Rated,


### Replicating nlargest with sort_values

In [110]:
# Baseline to replicate: the five lowest-budget movies among
# the 100 movies with the highest imdb_score.
movie[["movie_title", "imdb_score", "budget"]].nlargest(
    n=100, columns="imdb_score"
).nsmallest(n=5, columns="budget")

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [105]:
# Same selection as before, then drop rows that repeat an
# (imdb_score, budget) pair already seen.
movie[["movie_title", "imdb_score", "budget"]].nlargest(
    n=100, columns="imdb_score"
).nsmallest(n=5, columns="budget").drop_duplicates(
    subset=["imdb_score", "budget"]
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0


In [100]:
# Replicate the .nlargest(100, "imdb_score") step: sort by
# imdb_score in descending order and keep the first 100 rows
# with .head:
movie[["movie_title", "imdb_score", "budget"]].sort_values(
    by="imdb_score", ascending=False
).head(100)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0
...,...,...,...
3799,Anne of Green Gables,8.4,
3777,Requiem for a Dream,8.4,4500000.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4636,The Other Dream Team,8.4,500000.0


In [109]:
# With the top 100 scoring movies in hand, sort them by budget
# (ascending) and take the first five with .head to get the
# lowest-budget films:
(
    movie[["movie_title", "imdb_score", "budget"]]
    .sort_values(by="imdb_score", ascending=False)
    .head(100)
    .sort_values(by="budget")
    .head(5)
)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4801,Children of Heaven,8.5,180000.0
4804,Butterfly Girl,8.7,180000.0
4706,12 Angry Men,8.9,350000.0
4636,The Other Dream Team,8.4,500000.0


In [113]:
# Look at the last five of the 100 rows that .nlargest returns,
# to compare against the sort-based version.
movie[["movie_title", "imdb_score", "budget"]].nlargest(
    n=100, columns="imdb_score"
).tail(5)

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill a Mockingbird,8.4,2000000.0
4395,Reservoir Dogs,8.4,1200000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [123]:
# Last five of the top 100 obtained by sorting with the default
# sort algorithm; movies tied on imdb_score may come out in a
# different order than with .nlargest.
movie[["movie_title", "imdb_score", "budget"]].sort_values(
    by="imdb_score", ascending=False
).head(100).tail(5)

Unnamed: 0,movie_title,imdb_score,budget
3799,Anne of Green Gables,8.4,
3777,Requiem for a Dream,8.4,4500000.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4636,The Other Dream Team,8.4,500000.0
2455,Aliens,8.4,18500000.0


In [126]:
# Repeat the sort with a stable algorithm so that movies tied on
# imdb_score keep their original row order, which reproduces the
# .nlargest result. The kind is spelled out in full: the earlier
# 'mergsort' is not one of the documented option names.
(
    movie[
        [
            "movie_title",
            "imdb_score",
            "budget",
        ]
    ].sort_values("imdb_score", ascending=False, kind="mergesort")
    .head(100)
    .tail()
)

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill a Mockingbird,8.4,2000000.0
4395,Reservoir Dogs,8.4,1200000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


### Calculating a trailing stop order price 

In [129]:
# To get started, we will work with Tesla Motors
# (TSLA) stock and presume a purchase
# on the first trading day of 2017:

import datetime
import pandas_datareader.data as web
import requests_cache

In [131]:
# Create a requests_cache session that uses the "sqlite" backend
# with the cache name "cache"; expire_after is set to 90 days.
session = requests_cache.CachedSession(
    cache_name="cache",
    backend="sqlite",
    expire_after=datetime.timedelta(days=90),
)

In [132]:
# Request TSLA price history starting 2017-1-1 through
# pandas-datareader's "yahoo" data source, using the cached session.
# NOTE(review): per this cell's recorded output the request failed
# with RemoteDataError (Yahoo answered with a 404 page), so `tsla`
# is never assigned. Confirm whether this data source still works
# or switch to another provider before relying on later cells.
tsla = web.DataReader(
    "tsla",
    data_source="yahoo",
    start="2017-1-1",
    session=session,


)

RemoteDataError: Unable to read URL: https://finance.yahoo.com/quote/tsla/history?period1=1483239600&period2=1646708399&interval=1d&frequency=1d&filter=history
Response Text:
b'<!DOCTYPE html>\n  <html lang="en-us"><head>\n  <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n      <meta charset="utf-8">\n      <title>Yahoo</title>\n      <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">\n      <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n      <style>\n  html {\n      height: 100%;\n  }\n  body {\n      background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;\n      background-size: cover;\n      height: 100%;\n      text-align: center;\n      font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;\n  }\n  table {\n      height: 100%;\n      width: 100%;\n      table-layout: fixed;\n      border-collapse: collapse;\n      border-spacing: 0;\n      border: none;\n  }\n  h1 {\n      font-size: 42px;\n      font-weight: 400;\n      color: #400090;\n  }\n  p {\n      color: #1A1A1A;\n  }\n  #message-1 {\n      font-weight: bold;\n      margin: 0;\n  }\n  #message-2 {\n      display: inline-block;\n      *display: inline;\n      zoom: 1;\n      max-width: 17em;\n      _width: 17em;\n  }\n      </style>\n  <script>\n    document.write(\'<img src="//geo.yahoo.com/b?s=1197757129&t=\'+new Date().getTime()+\'&src=aws&err_url=\'+encodeURIComponent(document.URL)+\'&err=%<pssc>&test=\'+encodeURIComponent(\'%<{Bucket}cqh[:200]>\')+\'" width="0px" height="0px"/>\');var beacon = new Image();beacon.src="//bcn.fp.yahoo.com/p?s=1197757129&t="+new Date().getTime()+"&src=aws&err_url="+encodeURIComponent(document.URL)+"&err=%<pssc>&test="+encodeURIComponent(\'%<{Bucket}cqh[:200]>\');\n  </script>\n  </head>\n  <body>\n  <!-- status code : 404 -->\n  <!-- Not Found on Server -->\n  <table>\n  <tbody><tr>\n      <td>\n      <img src="https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_205x58_frontpage.png" alt="Yahoo Logo">\n      <h1 style="margin-top:20px;">Will be right back...</h1>\n      <p id="message-1">Thank you for your 
patience.</p>\n      <p id="message-2">Our engineers are working quickly to resolve the issue.</p>\n      </td>\n  </tr>\n  </tbody></table>\n  </body></html>'