# Canadian Principal Crops Data

In [None]:
# Libraries
library(tidyverse)
library(RSQLite)
library(DBI)

"package 'ggplot2' was built under R version 4.5.2"
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
"package 'RSQLite' was built under R version 4.5.2"
"package 'DBI' was built under R version 4.5.2"


In this notebook, we look at an [agricultural produce dataset](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3210035901) for Canada for years 1908-2020 and a [CAD-USD exchange rates dataset](https://www.bankofcanada.ca/rates/exchange/daily-exchange-rates/).

## Construct the database and load up some data

In [2]:
# Annual crop data
acd_path <- r"(https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-RP0203EN-SkillsNetwork/labs/Practice%20Assignment/Annual_Crop_Data.csv)"

# Daily FX data
fxd_path <- r"(https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-RP0203EN-SkillsNetwork/labs/Practice%20Assignment/Daily_FX.csv)"

In [None]:
# Establish an RSQLite connection
c <- dbConnect(SQLite(), "data/FinalDB_lab4.sqlite") # ":memory:" if no permanent file is required

In [None]:
# Drop both tabs if they exist already
q <- "
DROP TABLE IF EXISTS crop_data;
"
dbExecute(c, q)

q <- "
DROP TABLE IF EXISTS daily_fx;
"
dbExecute(c, q)

In [29]:
# Load the data
acd_df <- read.csv(acd_path, colClasses = c(YEAR = "character"))
acd_df <- acd_df |> janitor::clean_names()
head(acd_df)

Unnamed: 0_level_0,cd_id,year,crop_type,geo,seeded_area,harvested_area,production,avg_yield
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>
1,0,1965-12-31,Barley,Alberta,1372000,1372000,2504000,1825
2,1,1965-12-31,Barley,Canada,2476800,2476800,4752900,1920
3,2,1965-12-31,Barley,Saskatchewan,708000,708000,1415000,2000
4,3,1965-12-31,Canola,Alberta,297400,297400,215500,725
5,4,1965-12-31,Canola,Canada,580700,580700,512600,885
6,5,1965-12-31,Canola,Saskatchewan,224600,224600,242700,1080


In [30]:
fxd_df <- read.csv(fxd_path, colClasses = c(date = "character"))
fxd_df <- fxd_df |> janitor::clean_names()
head(fxd_df)

"not all columns named in 'colClasses' exist"


Unnamed: 0_level_0,dfx_id,date,fxusdcad
Unnamed: 0_level_1,<int>,<chr>,<dbl>
1,0,2017-01-03,1.3435
2,1,2017-01-04,1.3315
3,2,2017-01-05,1.3244
4,3,2017-01-06,1.3214
5,4,2017-01-09,1.324
6,5,2017-01-10,1.3213


In [31]:
# Construct the tabs
q <- "
CREATE TABLE IF NOT EXISTS crop_data (
    cd_id INTEGER NOT NULL,
    year DATE NOT NULL,
    crop_type VARCHAR(20) NOT NULL,
    geo VARCHAR(20) NOT NULL,
    seeded_area INTEGER NOT NULL,
    harvested_area INTEGER NOT NULL,
    production INTEGER NOT NULL,
    avg_yield INTEGER NOT NULL,
    PRIMARY KEY (cd_id)
);
"

# Execute channel-query
df1 <- tryCatch(
    dbExecute(c, q),
    error = function(e) e
)

# Check whether df1 is an error object
if (inherits(df1, "error")) {
    cat("An error has occurred.\n")
    print(df1$message)
} else {
    cat("Table has been created successfully.\n")
}


Table has been created successfully.


In [32]:
q <- "
CREATE TABLE IF NOT EXISTS daily_fx (
    dfx_id INTEGER NOT NULL,
    date DATE NOT NULL,
    fxusdcad FLOAT(6),
    PRIMARY KEY (dfx_id)
);
"

df3 <- tryCatch(
    dbExecute(c, q),
    error = function(e) e
)

if (inherits(df3, "error")) {
    cat("An error has occurred.\n")
    print(df3$message)
} else {
    cat("Table has been created successfully.\n")
}

Table has been created successfully.


In [33]:
# Write the data into the database
dbWriteTable(c, "crop_data", acd_df, overwrite = TRUE, header = TRUE)

In [34]:
dbWriteTable(c, "daily_fx", fxd_df, overwrite = TRUE, header = TRUE)

In [None]:
# List the tables in the database
dbListTables(c)

In [None]:
# Check memory usage out of curiosity
#object.size(acd_df) |> format(units = "MB")
#object.size(fxd_df) |> format(units = "MB")

In [35]:
# Clear the dfs from memory
rm(acd_df, fxd_df)
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,1536114,82.1,2424501,129.5,2424501,129.5
Vcells,2723871,20.8,8388608,64.0,5272033,40.3


## Solve some practice problems

In [37]:
# Look at the number of rows in each table
q <- "
SELECT COUNT(*) FROM crop_data;
"
dbGetQuery(c, q)

COUNT(*)
<int>
672


In [38]:
q <- "
SELECT COUNT(*) FROM daily_fx;
"
dbGetQuery(c, q)

COUNT(*)
<int>
1033


In [39]:
# Query and display the first 6 rows of the crop data
q <- "
SELECT * FROM crop_data
LIMIT 6;
"
dbGetQuery(c, q)

cd_id,year,crop_type,geo,seeded_area,harvested_area,production,avg_yield
<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>
0,1965-12-31,Barley,Alberta,1372000,1372000,2504000,1825
1,1965-12-31,Barley,Canada,2476800,2476800,4752900,1920
2,1965-12-31,Barley,Saskatchewan,708000,708000,1415000,2000
3,1965-12-31,Canola,Alberta,297400,297400,215500,725
4,1965-12-31,Canola,Canada,580700,580700,512600,885
5,1965-12-31,Canola,Saskatchewan,224600,224600,242700,1080


In [41]:
# List the types of crops in the crop dataset
q <- "
SELECT DISTINCT(crop_type) FROM crop_data;
"
dbGetQuery(c, q)

crop_type
<chr>
Barley
Canola
Rye
Wheat


In [43]:
# Query and display the first 6 rows of the crop data for Rye
q <- "
SELECT * FROM crop_data
WHERE crop_type = 'Rye'
LIMIT 6;
"
dbGetQuery(c, q)

cd_id,year,crop_type,geo,seeded_area,harvested_area,production,avg_yield
<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>
6,1965-12-31,Rye,Alberta,81000,81000,116400,1435
7,1965-12-31,Rye,Canada,323900,323900,453400,1400
8,1965-12-31,Rye,Saskatchewan,166000,166000,224000,1350
18,1966-12-31,Rye,Alberta,70000,70000,109000,1555
19,1966-12-31,Rye,Canada,293400,293400,437600,1490
20,1966-12-31,Rye,Saskatchewan,161000,161000,228600,1420


In [45]:
# Which crops have had an average yield greater than or equal to 3000 KG per Hectare?
q <- "
SELECT DISTINCT(crop_type) FROM crop_data
WHERE avg_yield >= 3000;
"
dbGetQuery(c, q)

crop_type
<chr>
Barley
Wheat
Rye


In [47]:
# Find the first and last dates of each table
q <- "
SELECT MIN(year) AS first, MAX(year) AS last FROM crop_data;
"
dbGetQuery(c, q)

first,last
<chr>,<chr>
1965-12-31,2020-12-31


In [48]:
q <- "
SELECT MIN(date) AS first, MAX(date) AS last FROM daily_fx;
"
dbGetQuery(c, q)

first,last
<chr>,<chr>
2017-01-03,2021-02-18


In [None]:
# List the top 10 years of Wheat production in Saskatchewan in terms of harvested area
q <- "
-- Top 5 years for the sake of space
SELECT STRFTIME('%Y', year) AS year, production FROM crop_data -- msql: `YEAR(date)`
WHERE geo = 'Saskatchewan' AND crop_type = 'Wheat'
ORDER BY production DESC
LIMIT 5;
"
dbGetQuery(c, q)

year,production
<chr>,<int>
1991,18501200
1986,18370200
2013,18298300
1990,17485900
1996,16547000


In [58]:
# How many years did Barley yield at least 2000 KG per Hectare in Canada?
q <- "
SELECT COUNT(year) AS year_count FROM crop_data
WHERE avg_yield >= 2000 AND geo = 'Canada' AND crop_type = 'Barley';
"
dbGetQuery(c, q)

year_count
<int>
52


In [None]:
# How much farm land was seeeded with Barley in Alberta but not harvested each year since the year 2000?
q <- "
-- Last 5 years for the sake of space
SELECT
    STRFTIME('%Y', year) AS year,
    ROUND(((seeded_area - harvested_area) * 1.0 / seeded_area) * 100, 2) AS pc_lost -- Force the integers to floats with `* 1.0`
FROM crop_data
WHERE year BETWEEN 2016 AND 2021 AND geo = 'Alberta' AND crop_type = 'Barley' -- Extend the range when comparing dates with integers;
"
dbGetQuery(c, q)

year,pc_lost
<chr>,<dbl>
2016,22.08
2017,12.29
2018,11.79
2019,11.73
2020,10.5


In [None]:
# Over the last 3 calendar years of data, what was the average value of the Canadian dollar relative to the USD?
q <- "
-- Verify year seletion and calculate the annual avg
SELECT STRFTIME('%Y', date) AS year, AVG(fxusdcad) AS avg_fx FROM daily_fx
WHERE date >= (SELECT DATE(MAX(date), '-3 years') FROM daily_fx) -- msql: `DATE_SUB(date, INTERVAL 3 YEAR)`
GROUP BY STRFTIME('%Y', date);
"
dbGetQuery(c, q)

year,avg_fx
<chr>,<dbl>
2018,1.303575
2019,1.326913
2020,1.341451
2021,1.273215


In [97]:
q <- "
-- Avg for the last three years
SELECT AVG(fxusdcad) AS avg_fx FROM daily_fx
WHERE date >= (SELECT DATE(MAX(date), '-2 years') FROM daily_fx);
"
dbGetQuery(c, q)

avg_fx
<dbl>
1.330622


In [99]:
# Use an implicit inner join to create a view of the crop data with an FX column included
q <- "
SELECT c.*, f.*
FROM crop_data AS c, daily_fx AS f
WHERE c.year = f.date
LIMIT 10;
"
dbGetQuery(c, q)

cd_id,year,crop_type,geo,seeded_area,harvested_area,production,avg_yield,dfx_id,date,fxusdcad
<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<dbl>
636,2018-12-31,Barley,Alberta,1260200,1111600,3996300,3595,498,2018-12-31,1.3642
637,2018-12-31,Barley,Canada,2627700,2395000,8379700,3499,498,2018-12-31,1.3642
638,2018-12-31,Barley,Saskatchewan,1089400,1020000,3439200,3372,498,2018-12-31,1.3642
639,2018-12-31,Canola,Alberta,2755900,2703000,5870600,2172,498,2018-12-31,1.3642
640,2018-12-31,Canola,Canada,9232200,9119700,20724000,2272,498,2018-12-31,1.3642
641,2018-12-31,Canola,Saskatchewan,4997900,4955000,11308000,2282,498,2018-12-31,1.3642
642,2018-12-31,Rye,Alberta,16000,9100,30000,3333,498,2018-12-31,1.3642
643,2018-12-31,Rye,Canada,135400,78900,236400,2995,498,2018-12-31,1.3642
644,2018-12-31,Rye,Saskatchewan,30400,21600,47400,2194,498,2018-12-31,1.3642
645,2018-12-31,Wheat,Alberta,3053100,2991300,10006100,3345,498,2018-12-31,1.3642


In [100]:
q <- "
SELECT c.*, f.*
FROM crop_data AS c
INNER JOIN daily_fx AS f
ON c.year = f.date
LIMIT 10;
"
dbGetQuery(c, q)

cd_id,year,crop_type,geo,seeded_area,harvested_area,production,avg_yield,dfx_id,date,fxusdcad
<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<dbl>
636,2018-12-31,Barley,Alberta,1260200,1111600,3996300,3595,498,2018-12-31,1.3642
637,2018-12-31,Barley,Canada,2627700,2395000,8379700,3499,498,2018-12-31,1.3642
638,2018-12-31,Barley,Saskatchewan,1089400,1020000,3439200,3372,498,2018-12-31,1.3642
639,2018-12-31,Canola,Alberta,2755900,2703000,5870600,2172,498,2018-12-31,1.3642
640,2018-12-31,Canola,Canada,9232200,9119700,20724000,2272,498,2018-12-31,1.3642
641,2018-12-31,Canola,Saskatchewan,4997900,4955000,11308000,2282,498,2018-12-31,1.3642
642,2018-12-31,Rye,Alberta,16000,9100,30000,3333,498,2018-12-31,1.3642
643,2018-12-31,Rye,Canada,135400,78900,236400,2995,498,2018-12-31,1.3642
644,2018-12-31,Rye,Saskatchewan,30400,21600,47400,2194,498,2018-12-31,1.3642
645,2018-12-31,Wheat,Alberta,3053100,2991300,10006100,3345,498,2018-12-31,1.3642


## Disconnect

In [101]:
# Disconnect
dbDisconnect(c)