In [77]:
# Must run this notebook from root directory in Anaconda for relative local file paths to work

Sys.time() # script start time; compare with the end-of-script timestamp to see how long the run took

[1] "2018-08-10 14:40:40 AST"

# Install and attach packages

In [78]:
# See note at the end of this notebook regarding the preferred way of installing packages

In [79]:
library("tidyverse") # attaches the tidyverse packages (dplyr, tidyr, readr, ggplot2, ...)

In [80]:
library("data.table") # provides fread(), used for every CSV import in this notebook

In [81]:
library("stringr") # string helpers (str_detect(), str_sub()) referenced in the Appendix examples

In [82]:
library("sf") # spatial data support (st_read(), st_as_sf()); needed for mapping

In [83]:
# install.packages("tmap")
# this for some reason can only be installed via install.packages(), but only has to be done once in your environment

In [84]:
library("tmap") # thematic map layers (tm_shape(), tm_symbols()); needed for mapping

In [85]:
# Set default options to display more digits, useful for gps coordinates

In [86]:
# Print up to 15 significant digits so GIS coordinates are not rounded when dataframes are displayed
options(digits = 15)

In [87]:
options("digits") # echo the current `digits` setting to confirm the change above took effect

In [88]:
sessionInfo() # record the R version, platform and attached package versions used for this run

R version 3.4.3 (2017-11-30)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)

Matrix products: default

locale:
[1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
[3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
[5] LC_TIME=English_Canada.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] tmap_1.10            sf_0.5-4             data.table_1.10.4-3 
 [4] forcats_0.2.0        stringr_1.2.0        dplyr_0.7.4         
 [7] purrr_0.2.4          readr_1.1.1          tidyr_0.7.2         
[10] tibble_1.4.1         ggplot2_2.2.1        tidyverse_1.2.1     
[13] RevoUtils_10.0.8     RevoUtilsMath_10.0.1

loaded via a namespace (and not attached):
  [1] colorspace_1.3-2   deldir_0.1-14      class_7.3-14      
  [4] gdalUtils_2.0.1.7  leaflet_1.1.0      rgdal_1.2-8       
  [7] satellite_1.0.0    IRdisplay_0.4.4    base64enc_0.1-3   
 

# Main Script Begins Here

In [89]:
Sys.time() # timestamp marking the start of the main script body

[1] "2018-08-10 14:40:40 AST"

## 1. Table A

#### Number of Homeless Beds in All Emergency or Transitional Shelters Per 100,000 Population, 2017

In [90]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [91]:
# Import data

# Load Table A (beds in all emergency or transitional shelters per 100,000 population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_A.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [92]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                  <fctr> AB, MB, SK, ON, BC, QC, NL, NS, NB, PI
$ TotalBeds                 <int> 4385, 947, 830, 8725, 2783, 3730, 166, 28...
$ TotalProvincialPopulation <int> 4286134, 1338109, 1163925, 14193384, 4817...
$ BedsPer100K               <int> 102, 71, 71, 61, 58, 44, 31, 30, 22, 5
$ PREABBR                   <fctr> Alta., Man., Sask., Ont., B.C., Que., N....
$ Year                      <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017,...


In [93]:
df # display the full table (10 rows)

Province,TotalBeds,TotalProvincialPopulation,BedsPer100K,PREABBR,Year
AB,4385,4286134,102,Alta.,2017
MB,947,1338109,71,Man.,2017
SK,830,1163925,71,Sask.,2017
ON,8725,14193384,61,Ont.,2017
BC,2783,4817160,58,B.C.,2017
QC,3730,8394034,44,Que.,2017
NL,166,528817,31,N.L.,2017
NS,284,953869,30,N.S.,2017
NB,170,759655,22,N.B.,2017
PI,7,152021,5,P.E.I.,2017


In [94]:
rm(list = "df") # drop Table A so the name `df` can be reused by the next section

## 2. Table B

#### Number of Emergency Homeless Beds at Female-Only Shelters Per 100,000 Female Population, 2017

In [95]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [96]:
# Import data

# Load Table B (emergency beds at female-only shelters per 100,000 female population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_B.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [97]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                        <fctr> AB, BC, NB, NL, NS, ON, QC, SK, MB...
$ TotalEmergencyBedsForFemales    <int> 134, 285, 24, 8, 43, 1120, 286, 241...
$ TotalProvincialPopulationFemale <int> 2117001, 2427673, 384250, 268207, 4...
$ EmergencyBedsForFemalesPer100K  <int> 6, 12, 6, 3, 9, 16, 7, 42, 0, 0
$ PREABBR                         <fctr> Alta., B.C., N.B., N.L., N.S., Ont...
$ Year                            <int> 2017, 2017, 2017, 2017, 2017, 2017,...


In [98]:
df # display the full table (10 rows)

Province,TotalEmergencyBedsForFemales,TotalProvincialPopulationFemale,EmergencyBedsForFemalesPer100K,PREABBR,Year
AB,134,2117001,6,Alta.,2017
BC,285,2427673,12,B.C.,2017
NB,24,384250,6,N.B.,2017
NL,8,268207,3,N.L.,2017
NS,43,486028,9,N.S.,2017
ON,1120,7212574,16,Ont.,2017
QC,286,4219609,7,Que.,2017
SK,241,577058,42,Sask.,2017
MB,0,671339,0,Man.,2017
PI,0,77666,0,P.E.I.,2017


In [99]:
rm(list = "df") # drop Table B so the name `df` can be reused by the next section

## 3. Table C

#### Number of Emergency Homeless Beds at Male-Only Shelters Per 100,000 Male Population, 2017

In [100]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [101]:
# Import data

# Load Table C (emergency beds at male-only shelters per 100,000 male population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_C.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [102]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                      <fctr> AB, BC, NB, NL, NS, ON, PI, QC, SK, MB
$ TotalEmergencyBedsForMales    <int> 389, 356, 45, 9, 120, 2663, 7, 971, 1...
$ TotalProvincialPopulationMale <int> 2169133, 2389487, 375405, 260610, 467...
$ EmergencyBedsForMalesPer100K  <int> 18, 15, 12, 3, 26, 38, 9, 23, 20, 0
$ PREABBR                       <fctr> Alta., B.C., N.B., N.L., N.S., Ont.,...
$ Year                          <int> 2017, 2017, 2017, 2017, 2017, 2017, 2...


In [103]:
df # display the full table (10 rows)

Province,TotalEmergencyBedsForMales,TotalProvincialPopulationMale,EmergencyBedsForMalesPer100K,PREABBR,Year
AB,389,2169133,18,Alta.,2017
BC,356,2389487,15,B.C.,2017
NB,45,375405,12,N.B.,2017
NL,9,260610,3,N.L.,2017
NS,120,467841,26,N.S.,2017
ON,2663,6980810,38,Ont.,2017
PI,7,74355,9,P.E.I.,2017
QC,971,4174425,23,Que.,2017
SK,118,586867,20,Sask.,2017
MB,0,666770,0,Man.,2017


In [104]:
rm(list = "df") # drop Table C so the name `df` can be reused by the next section

## 4. Table D

#### Number of Emergency Homeless Beds at Non-Gender Specific Shelters Per 100,000 Population, 2017

In [105]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [106]:
# Import data

# Load Table D (emergency beds at non-gender-specific shelters per 100,000 population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_D.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [107]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                         <fctr> AB, BC, MB, NB, NL, NS, ON, QC, S...
$ TotalEmergencyBedsForAnyGender   <int> 2659, 1338, 661, 81, 36, 45, 2844,...
$ TotalProvincialPopulation        <int> 4286134, 4817160, 1338109, 759655,...
$ EmergencyBedsForAnyGenderPer100K <int> 62, 28, 49, 11, 7, 5, 20, 8, 6, 0
$ PREABBR                          <fctr> Alta., B.C., Man., N.B., N.L., N....
$ Year                             <int> 2017, 2017, 2017, 2017, 2017, 2017...


In [108]:
glimpse(df) # NOTE(review): exact repeat of the previous cell (same call, same output); this cell can be deleted

Observations: 10
Variables: 6
$ Province                         <fctr> AB, BC, MB, NB, NL, NS, ON, QC, S...
$ TotalEmergencyBedsForAnyGender   <int> 2659, 1338, 661, 81, 36, 45, 2844,...
$ TotalProvincialPopulation        <int> 4286134, 4817160, 1338109, 759655,...
$ EmergencyBedsForAnyGenderPer100K <int> 62, 28, 49, 11, 7, 5, 20, 8, 6, 0
$ PREABBR                          <fctr> Alta., B.C., Man., N.B., N.L., N....
$ Year                             <int> 2017, 2017, 2017, 2017, 2017, 2017...


In [109]:
df # display the full table (10 rows)

Province,TotalEmergencyBedsForAnyGender,TotalProvincialPopulation,EmergencyBedsForAnyGenderPer100K,PREABBR,Year
AB,2659,4286134,62,Alta.,2017
BC,1338,4817160,28,B.C.,2017
MB,661,1338109,49,Man.,2017
NB,81,759655,11,N.B.,2017
NL,36,528817,7,N.L.,2017
NS,45,953869,5,N.S.,2017
ON,2844,14193384,20,Ont.,2017
QC,690,8394034,8,Que.,2017
SK,75,1163925,6,Sask.,2017
PI,0,152021,0,P.E.I.,2017


In [110]:
rm(list = "df") # drop Table D so the name `df` can be reused by the next section

## 5. Table E

#### Number of Transitional Homeless Beds at Female-Only Shelters Per 100,000 Female Population, 2017

In [111]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [112]:
# Import data

# Load Table E (transitional beds at female-only shelters per 100,000 female population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_E.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [113]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                          <fctr> AB, BC, MB, NB, NL, NS, ON, QC, ...
$ TotalTransitionalBedsForFemales   <int> 198, 50, 5, 20, 12, 42, 129, 246,...
$ TotalProvincialPopulationFemale   <int> 2117001, 2427673, 671339, 384250,...
$ TransitionalBedsForFemalesPer100K <int> 9, 2, 1, 5, 4, 9, 2, 6, 11, 0
$ PREABBR                           <fctr> Alta., B.C., Man., N.B., N.L., N...
$ Year                              <int> 2017, 2017, 2017, 2017, 2017, 201...


In [114]:
df # display the full table (10 rows)

Province,TotalTransitionalBedsForFemales,TotalProvincialPopulationFemale,TransitionalBedsForFemalesPer100K,PREABBR,Year
AB,198,2117001,9,Alta.,2017
BC,50,2427673,2,B.C.,2017
MB,5,671339,1,Man.,2017
NB,20,384250,5,N.B.,2017
NL,12,268207,4,N.L.,2017
NS,42,486028,9,N.S.,2017
ON,129,7212574,2,Ont.,2017
QC,246,4219609,6,Que.,2017
SK,66,577058,11,Sask.,2017
PI,0,77666,0,P.E.I.,2017


In [115]:
rm(list = "df") # drop Table E so the name `df` can be reused by the next section

## 6. Table F

#### Number of Transitional Homeless Beds at Male-Only Shelters Per 100,000 Male Population, 2017

In [116]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [117]:
# Import data

# Load Table F (transitional beds at male-only shelters per 100,000 male population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_F.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [118]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                        <fctr> AB, BC, MB, NL, NS, ON, QC, SK, PI...
$ TotalTransitionalBedsForMales   <int> 315, 70, 80, 2, 19, 559, 539, 133, ...
$ TotalProvincialPopulationMale   <int> 2169133, 2389487, 666770, 260610, 4...
$ TransitionalBedsForMalesPer100K <int> 15, 3, 12, 1, 4, 8, 13, 23, 0, 0
$ PREABBR                         <fctr> Alta., B.C., Man., N.L., N.S., Ont...
$ Year                            <int> 2017, 2017, 2017, 2017, 2017, 2017,...


In [119]:
df # display the full table (10 rows)

Province,TotalTransitionalBedsForMales,TotalProvincialPopulationMale,TransitionalBedsForMalesPer100K,PREABBR,Year
AB,315,2169133,15,Alta.,2017
BC,70,2389487,3,B.C.,2017
MB,80,666770,12,Man.,2017
NL,2,260610,1,N.L.,2017
NS,19,467841,4,N.S.,2017
ON,559,6980810,8,Ont.,2017
QC,539,4174425,13,Que.,2017
SK,133,586867,23,Sask.,2017
PI,0,74355,0,P.E.I.,2017
NB,0,375405,0,N.B.,2017


In [120]:
rm(list = "df") # drop Table F so the name `df` can be reused by the next section

## 7. Table G

#### Number of Transitional Homeless Beds at Non-Gender Specific Shelters Per 100,000 Population, 2017

In [121]:
# Homeless Shelters in Canada
# Source: https://open.canada.ca/data/en/dataset/7e0189e3-8595-4e62-a4e9-4fed6f265e10
# See also: http://publications.gc.ca/collections/collection_2018/edsc-esdc/Em20-91-2018-eng.pdf

# 2017 population stats for each province
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000501&pickMembers%5B0%5D=1.10&pickMembers%5B1%5D=2.1

In [122]:
# Import data

# Load Table G (transitional beds at non-gender-specific shelters per 100,000 population) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_G.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [123]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 6
$ Province                            <fctr> AB, BC, MB, NL, NS, ON, QC, SK...
$ TotalTransitionalBedsForAnyGender   <int> 690, 684, 201, 99, 15, 1410, 99...
$ TotalProvincialPopulation           <int> 4286134, 4817160, 1338109, 5288...
$ TransitionalBedsForAnyGenderPer100K <int> 16, 14, 15, 19, 2, 10, 12, 17, ...
$ PREABBR                             <fctr> Alta., B.C., Man., N.L., N.S.,...
$ Year                                <int> 2017, 2017, 2017, 2017, 2017, 2...


In [124]:
df # display the full table (10 rows)

Province,TotalTransitionalBedsForAnyGender,TotalProvincialPopulation,TransitionalBedsForAnyGenderPer100K,PREABBR,Year
AB,690,4286134,16,Alta.,2017
BC,684,4817160,14,B.C.,2017
MB,201,1338109,15,Man.,2017
NL,99,528817,19,N.L.,2017
NS,15,953869,2,N.S.,2017
ON,1410,14193384,10,Ont.,2017
QC,998,8394034,12,Que.,2017
SK,197,1163925,17,Sask.,2017
PI,0,152021,0,P.E.I.,2017
NB,0,759655,0,N.B.,2017


In [125]:
rm(list = "df") # drop Table G so the name `df` can be reused by the next section

## 8. Table J

#### Percent of Females 12 and Older Who Have a Strong Sense of Belonging to Their Local Community, 2017

In [126]:
# Import dataset
# Canadian health characteristics, annual estimates
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310009601

In [127]:
# Import data

# Load Table J (percent of females 12 and older with a strong sense of community belonging) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_J.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [128]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 2
$ SenseOfBelongingToLocalCommunityForFemales <dbl> 69.6, 72.9, 72.9, 77.9, ...
$ PREABBR                                    <fctr> Alta., B.C., Man., N.B....


In [129]:
df # display the full table (10 rows)

SenseOfBelongingToLocalCommunityForFemales,PREABBR
69.6,Alta.
72.9,B.C.
72.9,Man.
77.9,N.B.
77.6,N.L.
72.6,N.S.
72.4,Ont.
74.9,P.E.I.
61.4,Que.
74.8,Sask.


In [130]:
rm(list = "df") # drop Table J so the name `df` can be reused by the next section

## 9. Table K

#### Percent of Males 12 and Older Who Have a Strong Sense of Belonging to Their Local Community, 2017

In [131]:
# Import dataset
# Canadian health characteristics, annual estimates
# Source: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310009601

In [132]:
# Import data

# Load Table K (percent of males 12 and older with a strong sense of community belonging) into `df`.
# The path is relative, so the working directory must be the project root.
df <- fread(
    "DataFilesCanadaProject/Table_K.csv",
    header = TRUE,            # first row of the CSV holds the column names
    stringsAsFactors = TRUE   # read text columns as factors
)

In [133]:
dplyr::glimpse(df) # compact overview: dimensions, column types and leading values

Observations: 10
Variables: 2
$ SenseOfBelongingToLocalCommunityForMales <dbl> 68.2, 70.4, 74.0, 76.1, 76...
$ PREABBR                                  <fctr> Alta., B.C., Man., N.B., ...


In [134]:
df # display the full table (10 rows)

SenseOfBelongingToLocalCommunityForMales,PREABBR
68.2,Alta.
70.4,B.C.
74.0,Man.
76.1,N.B.
76.9,N.L.
70.7,N.S.
71.0,Ont.
78.1,P.E.I.
61.8,Que.
74.0,Sask.


# Main Script Ends Here

In [135]:
Sys.time() # timestamp marking the end of the main script body

[1] "2018-08-10 14:40:42 AST"

# Appendix 1 - Examples of common coding techniques

In [136]:
# Example of how to read and filter a large csv file a chunk at a time, e.g. filter 50000 rows at a time into a new dataframe
# This is very efficient, comparable to fread in speed but done in chunks to conserve memory
# I also prefer it to fread because I can filter out rows with missing values as I import the data
# and specify which columns I want to import 
# need readr library from tidyverse library


# Sys.time() # start time for reading file
# f <- function(x, pos) subset(x, 
#                             !is.na(Beds) # filter out any rows with missing values in the specified column in parenthesis
#                             & Gender == "Male") # only include rows where the specified condition is met
#                             # add any other filter criteria with `&` before the closing parenthesis above
# df <- read_csv_chunked("http://www.edsc-esdc.gc.ca/ouvert-open/hps/CDHPD-OpenDataNSPL-DataSet-20170804-ENFR.csv", 
#                            DataFrameCallback$new(f), 
#                            chunk_size = 50000, # this is an adequate chunk size
#                            # col_names = TRUE # this will keep original column names if you don't provide custom names
#                                               # most of the time it's best to rename every column explicitly as shown below
#                            col_names = c("ShelterType", # this renames each column in order of the csv file to what you specify
#                                          "ProvinceCode",
#                                          "City",
#                                          "TargetPopulation",
#                                          "Gender",
#                                          "ShelterName",
#                                          "Beds"), # all other columns after this will be dropped if not included        
#                            cols_only( # specify which columns with data types you want to include 
#                                       # to avoid importing unwanted data
#                                       # include all columns that you specified above with col_names if you renamed them
#                                "ShelterType" = col_character(),
#                                "ProvinceCode" = col_character(),
#                                "City" = col_character(),
#                                "TargetPopulation" = col_character(),
#                                "Gender" = col_character(),
#                                "ShelterName" = col_character(),
#                                "Beds" = col_number()),
#                            skip = 1, # skip original column headers if you renamed them
#                            trim_ws = TRUE) # trim leading and trailing whitespace for each data point
# Sys.time() # end time for reading file

In [137]:
# Example of how to read csv files quickly with fread while converting all chr variables to factors 
# need data.table library
# Use fread when you want a quick one-line import and know you won't have memory issues loading the entire dataset at once
# or when you know you need to keep all information loaded into memory

# df <- fread("DataFilesAOP/AB_Provincial_Detailed_Crime_Stats_2006-2011-2016.csv", 
#            stringsAsFactors = TRUE, header = TRUE)

In [138]:
# Example of how to write a dataframe to a csv file

# write.csv(df, 
#           file = "NS_Crime_Stats_Clean_Extract3.csv",
#           row.names = FALSE) # set row.names to FALSE to not include an index column in the csv output file

In [139]:
# Example how to change all of the column names in a dataframe to something friendlier
# You want to avoid special characters and spaces in column names to prevent problems with different packages and features
# Provide new names in a character vector

# colnames(df) <- c(" ", " ")

In [140]:
# Example how to set the number of digits displayed in dataframes

# options(digits=15) # set the visible number of digits to 15, useful when displaying GIS coordinates in dataframes
# options("digits") # display the max number of digits that appear in cells of a dataframe

In [141]:
# Example of how to convert columns to the correct data types and how to delete columns from a dataframe

# use as.numeric() and as.factor() where appropriate
# df$column_name <- NULL will delete that column from the dataframe

In [142]:
# Example of how to recode values of a column for rows meeting a condition

# df$LATITUDE[df$GEO == 
#                          "Amherst, Nova Scotia, municipal"] <- 45.8338198
# df$LONGITUDE[df$GEO == 
#                          "Amherst, Nova Scotia, municipal"] <- -64.2109455
#
# df$COUNTY[df$GEO ==
#                          "Amherst, Nova Scotia, municipal"] <- "Cumberland"

In [143]:
# Example of how to filter a dataframe to rows where a specific string appears anywhere in a specific column
# This example filters df to rows where "MEP" appears in the "Contents" column (need stringr library)

# df_MEP <- df %>%
#   filter(str_detect(Contents, "MEP"))

In [144]:
# Example of how to handle dates - converting from timestamp to YYYYMMDD
# This example is adequate if you really don't need a date field for arithmetic or plotting with ggplot2

# parse out date and time from Timestamp
# df <- 
#   separate(df, Timestamp, # e.g., "01/01/2017 12:00:00 AM"
#           into = c("Date", "Time", "AMPM"), sep = " ")

# parse out Month, Day, Year from Date
# df <- 
#  separate(df, Date, 
#           into = c("Month", "Day", "Year"), sep = "/")

# Remove unwanted columns
# df$Time <- NULL
# df$AMPM <- NULL

# unite the columns "Year", "Month", "Day" into a single column for "YYYYMMDD"
# df <- 
#  unite_(df, "YYYYMMDD", c("Year","Month", "Day"), sep = "")

# sort by date in decreasing order
# df <- df[order(df$YYYYMMDD, decreasing = TRUE), ]

In [145]:
# Use the lubridate library if arithmetic is important when working with date and time data
# In my opinion, the lubridate functions are more complicated than my current needs, so I prefer my approach above
# for handling dates/times

# https://lubridate.tidyverse.org/
# https://rawgit.com/rstudio/cheatsheets/master/lubridate.pdf

In [146]:
# Example of how to subset and retain only selected characters from a cell
# This is useful if you want to strip out useless characters in a cell given a start and end position to retain
# need stringr library

# df$gps <- 
#   str_sub(df$gps, start = 3, end = -2) 

# this keeps characters 3 through the second-to-last, i.e. it gets rid of the first 2 characters and the last character
# you can get an accurate count of characters when viewing them with glimpse(df)

In [147]:
# Example how to merge two dataframes on common column names
# https://stackoverflow.com/questions/1299871/how-to-join-merge-data-frames-inner-outer-left-right

# CanadaMap <- merge(CanadaMap, df_Provincial) # join the two dataframes, merge will use the common column name to join with

In [148]:
# Example of how to combine two or more dataframes vertically with the same column names

# df4 <- rbind(df1, df2, df3)

In [149]:
# Example of how to combine two or more dataframes horizontally with the same number of rows but different columns

# df4 <- cbind(df1, df2, df3)

In [150]:
# Example of how to create a new column that takes on values by performing arithmetic on existing columns 
# need dplyr and tidyverse library
# This example creates a new column that divides one column by another and multiplies the result by a constant

# NSMAP2011 <- NSMAP2011 %>%
#   mutate(PropertyCrimeCountyRatePer100K = PropertyCrimeIncidentsPerCounty/CountyPopulation * 100000)

In [151]:
# Example of how to order a dataframe by a column in descending order
# a "-" in front of the column name will sort it in descending (largest to smallest) order

# df <- 
#  df[with(df, order(-Incidents_Total)), ]


# adding multiple column names will prioritize ordering in that sequence

# df <- 
#  df[with(df, order(REGION, COUNTY, GEO, Violations, Statistics)), ]

#--------------------
# Alternatively, you can also use the following:
# sort by date in decreasing order
# df <- df[order(df$YYYYMMDD, decreasing = TRUE), ]

In [152]:
# Before spreading data for tidy format, make sure to remove any special characters beforehand
# This ensures that after spreading the data, the column names are all properly formatted
# Properly formatted column names will always appear flush against the dollar sign when glimpsing a dataframe
# If spaces or special characters are present in the column name, they will appear within single quotes after the dollar sign
# when glimpsing the dataframe after spreading the data

# df_health$Indicators <- gsub(" ", "_", df_health$Indicators)
# df_health$Indicators <- gsub(",", "", df_health$Indicators)
# df_health$Indicators <- gsub(";", "", df_health$Indicators)
# df_health$Indicators <- gsub("-", "_", df_health$Indicators)
# df_health$Indicators <- gsub("\\(", "", df_health$Indicators) # this removes left parenthesis
# df_health$Indicators <- gsub("\\)", "", df_health$Indicators) # this removes right parenthesis

# df_health <- df_health %>%
#   spread(Indicators, Value)

In [153]:
# Example of how to load a shapefile for mapping 
# need sf library

# NS <- st_read("GIS_Nova_Scotia/Counties_GCS.shp")

In [154]:
# Example of how to convert a dataframe of lats/longs to an sf dataframe for points spatial data 
# need sf library

# pts <- st_as_sf(df, coords = c("LONGITUDE", "LATITUDE"), crs = 4269) # create sf object called "pts" from df
# class(pts)
# glimpse(pts) # LONGITUDE and LATITUDE columns are replaced with a geometry column for point data

In [155]:
# Example how to add custom icons for points spatial data 
# need sf library and tmap library

# Using custom icons for points instead of bubbles

# file <- "DataFilesGeneral/flower_960_720.png"  
# flower_icon <- tmap_icons(file, width = 48, height = 48, keep.asp = TRUE,
#   just = c("center", "center"), as.local = TRUE)

# my_map_2 <- my_map_0 +
#   tm_shape(pts) +
#   tm_symbols("VALUE", shape = flower_icon, size = .3, alpha = .2, border.alpha = 0) 
# my_map_2

# Appendix 2 - Note regarding preferred way for installing packages

In [156]:
# Preferably, install new packages via Anaconda Prompt > conda install r-package_name
# Try to only use library() in notebook instead of install.packages()
# this is more reliable than using install.packages("package name") in R script
# Anaconda Prompt > conda list shows what packages are installed

# However, when the above isn't true for some reason, it is noted below
# e.g., tmap must be installed through install.packages("tmap")