# External Factors Analysis

Copyright ©2022. Stephen Rigden. This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

In [25]:
import pandas


from pathlib import Path

In [26]:
# Base name (no extension) of the CSV to import; edit this to match the current import file.
import_file_name = 'External Factors'

# Build the input and output locations relative to the directory two levels
# above the current working directory.
project_path = Path.cwd().parent.parent
extra_data_file = project_path.joinpath('data', 'raw', f"{import_file_name}.csv")
extra_data_pickle = project_path.joinpath('data', 'processed', 'extra_data_preprocessed.pickle')

# Read the raw CSV into a DataFrame and print a summary of its columns and dtypes.
eds = pandas.read_csv(extra_data_file)
eds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           84 non-null     object 
 1   heart_tablets  84 non-null     int64  
 2   alcohol        51 non-null     float64
 3   notes          1 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 2.8+ KB


In [27]:
# Show the first five rows of the freshly loaded frame.
eds.head(n=5)

Unnamed: 0,date,heart_tablets,alcohol,notes
0,6/1/1875,2,,A note
1,6/2/1875,2,177.0,
2,6/3/1875,2,,
3,6/4/1875,2,,
4,6/5/1875,2,30.0,


In [28]:
# Show the last five rows of the freshly loaded frame.
eds.tail(n=5)

Unnamed: 0,date,heart_tablets,alcohol,notes
79,8/19/1875,1,,
80,8/20/1875,1,90.0,
81,8/21/1875,1,,
82,8/22/1875,2,,
83,8/23/1875,2,,


### Convert numeric columns to nullable integers and fill missing values with 0

In [29]:
# Cast each count column to pandas' nullable 'Int64' dtype, then replace any
# missing entries with zero. Columns are processed in the same order as before.
for column in ('alcohol', 'heart_tablets'):
    eds[column] = eds[column].astype('Int64').fillna(0)
eds.head()

Unnamed: 0,date,heart_tablets,alcohol,notes
0,6/1/1875,2,0,A note
1,6/2/1875,2,177,
2,6/3/1875,2,0,
3,6/4/1875,2,0,
4,6/5/1875,2,30,


In [30]:
# Show the last five rows after the integer conversion.
eds.tail(n=5)

Unnamed: 0,date,heart_tablets,alcohol,notes
79,8/19/1875,1,0,
80,8/20/1875,1,90,
81,8/21/1875,1,0,
82,8/22/1875,2,0,
83,8/23/1875,2,0,


### Convert date column to datetime64

In [31]:
# Cast the 'date' column to datetime64[ns], then print the frame summary
# to confirm the resulting dtypes.
eds['date'] = eds['date'].astype('datetime64[ns]')
eds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           84 non-null     datetime64[ns]
 1   heart_tablets  84 non-null     Int64         
 2   alcohol        84 non-null     Int64         
 3   notes          1 non-null      object        
dtypes: Int64(2), datetime64[ns](1), object(1)
memory usage: 2.9+ KB


### Convert NaNs in the notes column to empty strings

In [32]:
# Replace missing values in the 'notes' column with empty strings; the
# per-column mapping leaves every other column untouched.
eds = eds.fillna(value={'notes': ''})
eds.head()

Unnamed: 0,date,heart_tablets,alcohol,notes
0,1875-06-01,2,0,A note
1,1875-06-02,2,177,
2,1875-06-03,2,0,
3,1875-06-04,2,0,
4,1875-06-05,2,30,


In [33]:
# Show the last five rows of the fully preprocessed frame.
eds.tail(n=5)

Unnamed: 0,date,heart_tablets,alcohol,notes
79,1875-08-19,1,0,
80,1875-08-20,1,90,
81,1875-08-21,1,0,
82,1875-08-22,2,0,
83,1875-08-23,2,0,


In [34]:
# Create the output directory if it is missing (e.g. 'data/processed' has not
# been created yet); otherwise the write below fails.
extra_data_pickle.parent.mkdir(parents=True, exist_ok=True)

# Write the preprocessed frame to the configured pickle path.
eds.to_pickle(extra_data_pickle)