In [None]:
import pandas as pd

#### Convert column to datetime format

In [None]:
# Column 'a' holds timestamps as formatted strings; column 'b' holds the same
# instants as integer seconds since the Unix epoch.
df = pd.DataFrame(dict(
    a=['2019-01-01 14:13', '2019-02-01 14:13'],
    b=[1546351980, 1549030380],
))
df

In [None]:
# String timestamps are parsed with an explicit format; integer timestamps are
# interpreted as a count of seconds (unit='s').
df['a_asdate'] = df['a'].pipe(pd.to_datetime, format='%Y-%m-%d %H:%M')
df['b_asdate'] = df['b'].pipe(pd.to_datetime, unit='s')

In [None]:
df

In [None]:
df.dtypes

#### date manipulations

In [None]:
# One row per day from 1 January 2018 through 8 January 2018 (inclusive).
df = pd.Series(pd.date_range(start='1/1/2018', end='1/8/2018'), name='dates').to_frame()
df

In [None]:
# Derive calendar features from the datetime column through the .dt accessor.
df['year'] = df['dates'].dt.year
df['month'] = df['dates'].dt.month
df['day'] = df['dates'].dt.day
# dayofweek is numeric: Monday is 0 and Sunday is 6.
df['dayofweek'] = df['dates'].dt.dayofweek
# .dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# .dt.day_name() is its replacement and returns the same English day names.
df['weekday_name'] = df['dates'].dt.day_name()
df['is_leap_year'] = df['dates'].dt.is_leap_year
# see more https://pandas.pydata.org/docs/reference/series.html#datetimelike-properties

In [None]:
df

#### Filtering

In [None]:
# Small mixed-type frame used by the filtering examples below.
df = pd.DataFrame(dict(
    a=[1, 1, 1, 2, 2, 3, 4, 5],
    b=[10, 10, 11, 20, 0, 0, 40, 50],
    c=['apple', 'apple', 'plum', 'pear', 'plum', 'apple', 'apple', 'apple'],
))
df

In [None]:
# Element-wise comparison: yields a boolean Series, one value per row.
df['a'].lt(3)

In [None]:
# Keep only the rows where the boolean mask is True.
df.loc[df['a'].lt(3)]

In [None]:
# The filtered rows can be stored in a new variable.
only_apple = df.loc[df['c'].eq('apple')]
only_apple

In [None]:
# Combine two conditions with element-wise AND (&); both must hold for a row.
df.loc[df['b'].eq(10) & df['c'].eq('apple')]

In [None]:
# A condition may also compare two columns row by row.
df.loc[df['a'].lt(df['b'])]

#### Grouping

In [None]:
# Load the ratings data. Later cells use its 'title', 'rating', 'userId',
# 'genre' and 'year' columns.
movies = pd.read_csv(filepath_or_buffer='movies.csv')

In [None]:
# Number of rows for each title.
(
    movies
    .groupby('title')
    .size()
)

In [None]:
# Titles with the most rows first; head() keeps the first five.
(
    movies
    .groupby('title')
    .size()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Mean rating per title, highest first; head() keeps the first five.
(
    movies
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Several aggregations at once: a dict maps each column to its aggregation.
(
    movies
    .groupby('title')
    .agg({'rating': 'mean', 'userId': 'nunique'})
)

In [None]:
# Same aggregation as a named result, with the output columns renamed to say
# what they now contain (mean rating, number of distinct userId values).
ratings = (
    movies
    .groupby('title')
    .agg({'rating': 'mean', 'userId': 'nunique'})
    .rename(columns={'rating': 'rating_mean', 'userId': 'nb_rating'})
)

In [None]:
# Best-rated titles first; head() keeps the first five.
(
    ratings
    .sort_values(by='rating_mean', ascending=False)
    .head()
)

#### Bar Plot

In [None]:
%matplotlib inline

In [None]:
# Mean rating for each genre; the result is a Series indexed by genre.
ratings_by_genre = (
    movies
    .groupby('genre')['rating']
    .mean()
)

In [None]:
# Simplest bar chart: one bar per genre, in index order.
ratings_by_genre.plot.bar()

In [None]:
# Sort the values first so the bars appear in ascending order.
ratings_by_genre.sort_values().plot.bar()

In [None]:
# Add a title to the sorted bar chart (title text fixed: "rating", not "raging").
ratings_by_genre.sort_values().plot(kind='bar', title='Mean rating by genre')

In [None]:
# figsize is (width, height) in inches (title text fixed: "rating", not "raging").
ratings_by_genre.sort_values().plot(kind='bar', title='Mean rating by genre', figsize=(14,4))

In [None]:
# color accepts a named matplotlib color (title text fixed: "rating", not "raging").
ratings_by_genre.sort_values().plot(kind='bar', title='Mean rating by genre', figsize=(14,4), color='lime')

#### Pivoting

In [None]:
# Grouping by two keys gives a Series with a two-level (year, genre) index.
by_year_genre = (
    movies
    .groupby(['year', 'genre'])['rating']
    .mean()
)

In [None]:
by_year_genre.head()

In [None]:
# reset_index() turns the (year, genre) index levels back into ordinary
# columns, giving a flat DataFrame.
by_year_genre = (
    movies
    .groupby(['year', 'genre'])['rating']
    .mean()
    .reset_index()
)

In [None]:
by_year_genre.head()

In [None]:
# Reassign the sorted result instead of using inplace=True: the statement then
# reads like the other chained calls in this notebook and leaves no doubt about
# which object holds the sorted data.
by_year_genre = by_year_genre.sort_values('year')

In [None]:
by_year_genre

In [None]:
# Long -> wide: one row per year, one column per genre, cells hold the rating.
pivoted_df = by_year_genre.pivot(
    index='year',
    columns='genre',
    values='rating',
)

In [None]:
pivoted_df.head()

#### plotting groups

In [None]:
# Plotting a wide frame draws one group of bars per row (year) and one bar
# per column (genre) within each group.
pivoted_df.plot.bar(
    figsize=(15, 5),
    title='Average ratings by year of premiere and genre',
)

In [None]:
# colormap picks the bar colors for the columns from a named matplotlib colormap.
pivoted_df.plot.bar(
    figsize=(15, 5),
    title='Average ratings by year of premiere and genre',
    colormap='winter',
)

#### Figures with multiple plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Number of distinct userId values per (year, genre), flattened and sorted by year.
by_year_genre2 = (
    movies
    .groupby(['year', 'genre'])['userId']
    .nunique()
    .reset_index()
    .sort_values('year')
)
by_year_genre2

In [None]:
# Long -> wide: one row per year, one column per genre, cells hold the
# distinct-userId count.
pivoted_df2 = by_year_genre2.pivot(
    index='year',
    columns='genre',
    values='userId',
)
pivoted_df2

In [None]:
# Size the figure where it is created. Passing figsize to one of the plot calls
# (together with ax=) resizes the whole shared figure as a side effect, which
# hides the fact that the size applies to both panels.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
fig.suptitle('Ratings by year of premiere and genre')

# Left panel: mean rating per year and genre.
pivoted_df.plot(kind='bar',
                title='Average',
                colormap='winter',
                ax=axes[0])

# Right panel: number of distinct userId values per year and genre.
pivoted_df2.plot(kind='bar',
                 title='Number',
                 colormap='summer',
                 ax=axes[1])
plt.show()

In [None]:
# Index.isin() builds a boolean mask over the row labels; the year labels in
# this data are strings such as '(2010)'.
pivoted_df.loc[pivoted_df.index.isin(['(2010)', '(2011)'])]

In [None]:
# 2x2 grid: top row shows the selected years, bottom row all other years;
# left column shows the mean rating, right column the distinct-userId count.
# The figure size is set at creation instead of through the last plot call.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 5))
fig.suptitle('Ratings by year of premiere and genre')

selected_years = ['(2010)', '(2011)']
# Build each mask from the frame it is applied to. Reusing pivoted_df's mask on
# pivoted_df2 only works while both frames have exactly the same index.
avg_selected = pivoted_df.index.isin(selected_years)
nb_selected = pivoted_df2.index.isin(selected_years)

pivoted_df[avg_selected].plot(kind='bar',
                              title='Average',
                              colormap='winter',
                              ax=axes[0, 0])
pivoted_df2[nb_selected].plot(kind='bar',
                              title='Number',
                              colormap='summer',
                              ax=axes[0, 1])
pivoted_df[~avg_selected].plot(kind='bar',
                               title='Average',
                               colormap='winter',
                               ax=axes[1, 0])
pivoted_df2[~nb_selected].plot(kind='bar',
                               title='Number',
                               colormap='summer',
                               ax=axes[1, 1])
plt.show()

In [None]:
# 2x2 grid on a larger canvas: top row shows the selected years, bottom row all
# other years; left column shows the mean rating, right column the
# distinct-userId count. legend=False drops the legend on two of the panels.
# The figure size is set at creation instead of through the last plot call.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))
fig.suptitle('Ratings by year of premiere and genre')

selected_years = ['(2010)', '(2011)']
# Build each mask from the frame it is applied to. Reusing pivoted_df's mask on
# pivoted_df2 only works while both frames have exactly the same index.
avg_selected = pivoted_df.index.isin(selected_years)
nb_selected = pivoted_df2.index.isin(selected_years)

pivoted_df[avg_selected].plot(kind='bar',
                              title='Average',
                              colormap='winter',
                              ax=axes[0, 0])
pivoted_df2[nb_selected].plot(kind='bar',
                              title='Number',
                              colormap='summer',
                              legend=False,
                              ax=axes[0, 1])
pivoted_df[~avg_selected].plot(kind='bar',
                               title='Average',
                               colormap='winter',
                               legend=False,
                               ax=axes[1, 0])
pivoted_df2[~nb_selected].plot(kind='bar',
                               title='Number',
                               colormap='summer',
                               ax=axes[1, 1])
plt.show()

## Exercise

#### Find out if there is a difference between Subscriber/Customer in daily routine

### 1 - exercise

Load the trip csv into a pandas dataframe. <br>
In the trip dataframe create a new column called 'start_date'. It should contain the values from the column 'Start Date' converted to datetime. 

In [None]:
# Your code here


### 1 - check yourself

In [None]:
# Verify the conversion by re-parsing the raw strings and comparing datetimes.
# Formatting the datetimes back to text would need the non-padded strftime
# codes ('%-m', '%-d', '%-H'), which are a glibc extension and raise
# ValueError on Windows.
expected_start = pd.to_datetime(trip['Start Date'], format='%m/%d/%Y %H:%M')
# The dtype test comes first so that a column still holding the raw strings is
# reported as not converted rather than compared as text.
is_datetime = pd.api.types.is_datetime64_any_dtype(trip['start_date'])
if is_datetime and (trip['start_date'] != expected_start).sum() == 0:
    print('start_date column is successfully converted')
else:
    print('start_date column is NOT successfully converted')

### 2 - exercise
Create new column called start_hour that contains the hour in which the trip started.

In [None]:
# Your code here


### 2 - check yourself

In [None]:
# Accept any integer dtype: since pandas 2.0 the .dt.hour accessor returns
# int32 rather than int64, so an exact 'int64' comparison rejects a correct
# solution on current pandas.
start_hours = trip['start_hour']
if pd.api.types.is_integer_dtype(start_hours) and start_hours.min()==0 and start_hours.max()==23:
    print('start_hour column is successfully converted')
else:
    print('start_hour column is NOT successfully converted')

### 3 - exercise
First let's check if there is a daily pattern. Group the trip data by the start_hour column and get the size of each group. Assign it to a new variable called by_hour.

In [None]:
# Your code here


### 3 - check yourself

In [None]:
# The first group must be hour 0 and must contain 620 trips.
grouping_ok = by_hour.index[0] == 0 and by_hour[0] == 620
print('the grouping was successfull' if grouping_ok else 'the grouping was NOT successfull')

### 4 - exercise
Make a bar plot from the by_hour data. 
- The x axis should be the hour and the y the number of trips. 
- The chart should have a title
- The chart should be 10 inches wide and 5 inches tall
- The color of the columns should be set to something different from the default. (You can find named colors for example here: https://stackoverflow.com/questions/22408237/named-colors-in-matplotlib)

In [None]:
# Your code here


### 4 - check yourself
Your chart should be similar to this:
<img src="exc4_cahrt.png">

### 5 - exercise
It looks like the bikes are used most in the morning and in the afternoon. Let's check if this pattern is the same for Customers and Subscribers as well! <br>
Group the trip dataframe by Subscription Type and start_hour and get the size of each group. Reset the index and assign it to a new dataframe called by_type_hour. <br>
Rename the column with the group size to nb_trips.

In [None]:
# Your code here


### 5 - check yourself

In [None]:
# The third column must be the renamed size column and the frame must have
# 48 rows (presumably 2 subscription types x 24 hours).
grouping_ok = by_type_hour.columns[2] == 'nb_trips' and len(by_type_hour) == 48
print('the grouping was successfull' if grouping_ok else 'the grouping was NOT successfull')

### 6 - exercise
Pivot the by_type_hour dataframe. The pivoted dataframe should be called by_type_hour_pivot. The index should be the hours, the columns the subscription types (Customer and Subscriber) and the values the number of trips.

In [None]:
# Your code here


### 6 - check yourself

In [None]:
# The columns axis must be named after the pivoted column and there must be
# one row per hour (24 in total).
pivot_ok = by_type_hour_pivot.columns.name == 'Subscription Type' and len(by_type_hour_pivot) == 24
print('the pivoting was successfull' if pivot_ok else 'the pivoting was NOT successfull')

### 7 - exercise

Make a bar plot from the by_type_hour_pivot data.

- The x axis should be the hour and the y the number of trips.
- The chart should have a title
- The chart should be 10 inches wide and 5 inches tall
- The Customer and the Subscriber bars should have different colors from one colormap other than the default. (You can find named colormaps for example here: https://matplotlib.org/tutorials/colors/colormaps.html)

In [None]:
# Your code here


### 7 - check yourself
Your chart should be similar to this:
<img src="exc7_chart.png">

### 8 - exercise

Ok, it looks like Subscribers use the bikes for commuting while Customers use the bikes more during the day. Let's see if these patterns differ between weekends and weekdays! <br>
As we are about to introduce a new grouping variable, we can't use the aggregated data we created in the last tasks. So let's go back to the trip dataframe and create a new column called day_of_week with the name of the day of the start_date. Remember you can use the built-in methods of the .dt accessor of datetime-like columns. 

In [None]:
# Your code here


### 8 - check yourself

In [None]:
# All seven English day names must occur, and nothing else.
expected_days = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
column_ok = sorted(trip['day_of_week'].unique()) == expected_days
print('the column creation was successfull' if column_ok else 'the column creation was NOT successfull')

### 9 - exercise
We now know the name of the day for each date. Let's create a column called weekend. If the name of the day is Saturday or Sunday, the value of weekend should be True. Otherwise it should be False.

In [None]:
# Your code here


### 9 - check yourself

In [None]:
# Summing a boolean column counts its True values; 22980 trips are expected
# to fall on a weekend.
column_ok = trip['weekend'].sum() == 22980
print('the column creation was successfull' if column_ok else 'the column creation was NOT successfull')

### 10 - exercise
Create a figure with two charts next to each other. 
- One should show the number of trips by hour for Customers and Subscribers on the weekends and the other on the weekdays. 
- Both charts should have a title and they should share the y axis. 
- There should be a shared title in the middle above the charts <br><br>
To create the charts you have to filter the trip data for weekend/weekday first and then pivot the new dataframe. <br><br>
As an advanced task you can make a figure with 4 plots: in the first row the weekends and weekdays should be separate charts with differently colored bars for Customers and Subscribers. In the second row, Customers and Subscribers should be separate charts with differently colored bars for weekends and weekdays. The charts should share both axes and only the charts on the left should have a legend.

In [None]:
# Your code here


### 10 - check yourself
Your chart should be similar to this:
<img src="exc10_chart.png">
<br><br>Or to this if you chose the advanced task:
<img src="exc10_chart_advance.png">

### +1 - exercise
Include the month of the start_date in the analysis and create a figure showing some insight. <br>
Save the figure as a picture.

In [None]:
# Your code here
