In [3]:
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
# Read the raw donations export into a DataFrame and preview the first rows.
# NOTE(review): DATA_FOLDER is not defined in the cells shown here; it is
# presumably set in an earlier configuration cell - confirm.
raw_donations_path = f"{DATA_FOLDER}/raw_donations.csv"
donations_df = pd.read_csv(raw_donations_path)
donations_df.head()

Unnamed: 0,id,created_at,amount
0,00000ce845c00cbf0686c992fc369df4,2013-12-17 21:47:14,50.0
1,00002783bc5d108510f3f9666c8b1edd,2016-02-02 18:34:27,99.0
2,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,10.0
3,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,15.51
4,00002d44003ed46b066607c5455a999a,2017-01-16 14:20:10,100.0


In [5]:
# Convert the created_at strings into pandas datetimes
donations_df['created_at'] = pd.to_datetime(donations_df['created_at'])

# Grab a single cell: row label 0 of the created_at column
first_row_created_at = donations_df.loc[0, 'created_at']

# One left-aligned "label: value" line per datetime component; the attribute
# read from the timestamp is the lower-cased label (Year -> .year, ...)
component_lines = [
    f"\t{label:<6}: {getattr(first_row_created_at, label.lower())}"
    for label in ('Year', 'Month', 'Day', 'Hour', 'Minute', 'Second')
]
print("First row created at:\n\n" + "\n".join(component_lines))

First row created at:

	Year  : 2013
	Month : 12
	Day   : 17
	Hour  : 21
	Minute: 47
	Second: 14


Just print out the year, month, and day from the single value

In [6]:
# Show only the date components (year, month, day) of the single timestamp
tuple(getattr(first_row_created_at, part) for part in ('year', 'month', 'day'))

(2013, 12, 17)

In [7]:
# Inspect the type of the single value that .loc returned from created_at
type(first_row_created_at)

pandas._libs.tslibs.timestamps.Timestamp

In [8]:
# Add one column per datetime component of created_at, read through the
# .dt accessor; the column is named after the component it holds
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    donations_df[component] = getattr(donations_df['created_at'].dt, component)

donations_df.head()

Unnamed: 0,id,created_at,amount,year,month,day,hour,minute,second
0,00000ce845c00cbf0686c992fc369df4,2013-12-17 21:47:14,50.0,2013,12,17,21,47,14
1,00002783bc5d108510f3f9666c8b1edd,2016-02-02 18:34:27,99.0,2016,2,2,18,34,27
2,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,10.0,2016,10,25,20,15,11
3,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,15.51,2017,1,16,1,11,20
4,00002d44003ed46b066607c5455a999a,2017-01-16 14:20:10,100.0,2017,1,16,14,20,10


In [9]:
# Summarize the frame (columns, dtypes, memory usage) after adding the
# datetime component columns
donations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4687884 entries, 0 to 4687883
Data columns (total 9 columns):
id            object
created_at    datetime64[ns]
amount        float64
year          int64
month         int64
day           int64
hour          int64
minute        int64
second        int64
dtypes: datetime64[ns](1), float64(1), int64(6), object(1)
memory usage: 321.9+ MB


In [10]:
def assign_time_of_the_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    hour : int or float
        Hour of the day to classify.

    Returns
    -------
    str or float
        ``'morning'`` when ``hour`` is below 12, ``'midday'`` when it is at
        least 12 and below 18, ``'evening'`` when it is 18 or more, and
        ``NaN`` when ``hour`` itself is NaN.
    """
    if hour < 12:
        return 'morning'
    if hour < 18:
        return 'midday'
    if hour >= 18:
        return 'evening'
    # Only NaN fails every comparison above (e.g. the hour of a missing
    # timestamp). Report it as missing instead of mislabeling it 'evening'.
    return float('nan')


# Label each donation with the part of the day in which it was created,
# then preview the hour next to its label
donations_df['time_of_the_day'] = donations_df['hour'].map(assign_time_of_the_day)
donations_df[['hour', 'time_of_the_day']].head(20)

Unnamed: 0,hour,time_of_the_day
0,21,evening
1,18,evening
2,20,evening
3,1,morning
4,14,midday
5,14,midday
6,15,midday
7,18,evening
8,14,midday
9,18,evening


## Seasons?
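Seasons are assigned by calendar month, using the meteorological (Northern Hemisphere) definitions from [timeanddate.com](https://www.timeanddate.com/calendar/aboutseasons.html#targetText=Spring%20runs%20from%20March%201,29%20in%20a%20leap%20year):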

* Spring runs from March 1 to May 31;
* Summer runs from June 1 to August 31;
* Fall (autumn) runs from September 1 to November 30; and
* Winter runs from December 1 to February 28 (February 29 in a leap year).

In [11]:
def assign_season(month: int):
    """Map a month number to a season name.

    Parameters
    ----------
    month : int
        Month number to classify (1 = January ... 12 = December).

    Returns
    -------
    str or float
        ``'spring'`` for months 3-5, ``'summer'`` for 6-8, ``'fall'`` for
        9-11, ``'winter'`` for any other number, and ``NaN`` when ``month``
        itself is NaN.
    """
    if 3 <= month < 6:
        return 'spring'
    if 6 <= month < 9:
        return 'summer'
    if 9 <= month < 12:
        return 'fall'
    if month >= 12 or month < 3:
        return 'winter'
    # Only NaN fails every comparison above (e.g. the month of a missing
    # timestamp). Report it as missing instead of mislabeling it 'winter'.
    return float('nan')


# Derive the season from the month of created_at, then preview the month
# column next to its season label
donation_months = donations_df['created_at'].dt.month
donations_df['season'] = donation_months.map(assign_season)
donations_df[['month', 'season']].head(20)

Unnamed: 0,month,season
0,12,winter
1,2,winter
2,10,fall
3,1,winter
4,1,winter
5,1,winter
6,1,winter
7,2,winter
8,2,winter
9,3,spring


In [12]:
# List the columns currently present in the frame
donations_df.columns

Index(['id', 'created_at', 'amount', 'year', 'month', 'day', 'hour', 'minute',
       'second', 'time_of_the_day', 'season'],
      dtype='object')

## Amount Categories
We assign each donation to one of six numbered categories based on its amount (each upper bound is inclusive):

```
1. Giving $1 or less
2. Up to $5
3. Up to $20
4. Up to $50
5. Up to $100
6. Everything else
```

In [13]:
def assign_amount_category(amount):
    """Map a donation amount to a numbered size category.

    Parameters
    ----------
    amount : int or float
        Donation amount to classify.

    Returns
    -------
    int or float
        ``1`` for amounts up to 1, ``2`` up to 5, ``3`` up to 20, ``4`` up
        to 50, ``5`` up to 100 (every upper bound inclusive), ``6`` for
        anything above 100, and ``NaN`` when ``amount`` itself is NaN.
    """
    # Upper bounds are checked in ascending order, so the first bound that
    # the amount does not exceed decides the category.
    for category, upper_bound in enumerate((1, 5, 20, 50, 100), start=1):
        if amount <= upper_bound:
            return category
    if amount > 100:
        return 6
    # Only NaN fails every comparison above. Report a missing amount as
    # missing instead of placing it in the largest-donation category.
    return float('nan')


# Bucket each donation amount into its numbered category, then preview the
# amount next to its category
donations_df['category'] = donations_df['amount'].map(assign_amount_category)
donations_df[['amount', 'category']].head(20)

Unnamed: 0,amount,category
0,50.0,4
1,99.0,5
2,10.0,3
3,15.51,3
4,100.0,5
5,9.69,3
6,13.75,3
7,150.0,6
8,10.0,3
9,100.0,5


In [15]:
# Persist the enriched frame as CSV without writing the row index.
# NOTE(review): this path is hard-coded while the input is read from
# DATA_FOLDER - confirm both are meant to point at the same data directory.
processed_donations_path = '../../data/processed_donations.csv'
donations_df.to_csv(path_or_buf=processed_donations_path, index=False)