# Python Library

#### Import data from a file

In [None]:
# Option 1: load only the columns listed in vars_list
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
]
df = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'orders.csv'),
    usecols=vars_list,
)

# Option 2: load every column
# index_col=False keeps pandas from using the first column as the index
df = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

#### Dataframe functions

In [None]:
# Basic inspection helpers
df.head()      # first rows of the dataframe
df.tail()      # last rows of the dataframe
df.columns     # column labels (the attribute is `columns`; `df.column` raises AttributeError)
df.describe()  # summary statistics
df.info()      # dtypes and non-null counts

# Frequency of each value in a column; dropna=False also counts missing values
df['order_hour_of_day'].value_counts(dropna=False)

# Remove a column in place (columns=<label> is equivalent to <label>, axis=1)
df_combined.drop(columns='busiest_period_of_day', inplace=True)

# Adding a new flag column: three alternative approaches

# 1) DataFrame.apply with axis=1 calls price_label once per row ==> takes a long time!
#    (price_label is defined elsewhere; presumably it returns the label for one row -- TODO confirm)
df_combined['price_range'] = df_combined.apply(price_label, axis=1)
# 2) .loc with a boolean condition writes the label only to rows where prices > 15 ==> very fast!
df_combined.loc[df_combined['prices'] > 15, 'price_range_loc'] = 'High-range'
# 3) Assigning a ready-made sequence as a new column ==> very fast!
#    (result is built elsewhere; assumed to hold one value per row -- TODO confirm)
df_combined['busiest_day'] = result


#### Aggregation Functions

In [None]:
# Perform aggregation on the dataframe: AVERAGE order_number GROUPED BY department_id
df.groupby('department_id').agg({'order_number': ['mean']})

# Rough SQL equivalent:
# SELECT department_id, AVG(order_number)
# FROM <table>
# GROUP BY department_id

# Different way to get the same averages (returned as a Series instead of a DataFrame)
df.groupby('department_id')['order_number'].mean()

# transform() returns one value per original row, so the per-user maximum of
# order_number can be assigned straight into the new max_order column.
# The string alias 'max' is used instead of np.max: passing the NumPy callable
# is deprecated in recent pandas versions and emits a FutureWarning.
df['max_order'] = df.groupby(['user_id'])['order_number'].transform('max')

#### Data Wrangling

In [1]:
# Cast the order_id column to string
df['order_id'] = df['order_id'].astype('str')

# Swap rows and columns
df_t = df.transpose()

# Rename a single column in place
df.rename(
    columns={'order_dow': 'orders_day_of_week'},
    inplace=True,
)

# Subset for the Breakfast department (department_id 14) with a boolean mask
df_breakfast = df_product[df_product['department_id'].eq(14)]

# Subset for the Snacks department (department_id 19) with isin
df_snacks = df_product.loc[
    df_product['department_id'].isin([19])
]
df_snacks

# Party-related items: departments 5, 7, 12 and 20
df_party = df_product.loc[
    df_product['department_id'].isin([5, 7, 12, 20])
]
df_party


#### Visualization

In [None]:
# Crosstabs are a common tool for data checks in Python --
# think of them as Python's version of Excel's pivot tables.
crosstab = pd.crosstab(
    index=ords_prods_merge['days_since_prior_order'],
    columns=ords_prods_merge['order_number'],
    dropna=False,
)
# Copy the table to the system clipboard so it can be pasted elsewhere
crosstab.to_clipboard()

In [None]:
# Several ways to draw a bar chart
# (colors must already be defined as a list of colour names)
df.plot.bar(stacked=True, color=colors)
df['order_hour_of_day'].value_counts().sort_index().plot(kind='bar')
(
    df.loc[df['region'] == 'Region_4', 'orders_day_of_week']
    .value_counts()
    .plot.bar(stacked=True, color=colors)
)

In [None]:
# Histogram of order_hour_of_day using 75 bins
df['order_hour_of_day'].plot(kind='hist', bins=75)

In [None]:

# Key Question #5 part b
# Bar plot of the crosstab with horizontal (unrotated) x tick labels
barplot = crosstab.plot(kind='bar', rot=0)

In [None]:
# Key Question #5 part b
# Bar plot for hour_of_day_ordered
# NOTE(review): `style` looks like a line-plot option; confirm it has any effect on bars
barplot_2 = crosstab_2.plot(kind='bar', style=['o', 'rx'])
# Anchor the legend at the upper-right corner of the axes
barplot_2.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

In [None]:
# Key Question #5 part b
# Bar plot for days_since_prior_order
# NOTE(review): `style` looks like a line-plot option; confirm it has any effect on bars
barplot_3 = crosstab_3.plot(kind='bar', style=['o', 'rx'])
# Anchor the legend at the upper-right corner of the axes
barplot_3.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

In [None]:
# Key Question #5 part c
# Bar plot of crosstab_4 with horizontal (unrotated) x tick labels
barplot_4 = crosstab_4.plot(kind='bar', rot=0)

In [None]:
# Store a 10-colour 'Greens_r' palette in a variable for reuse
palette = sns.color_palette(palette='Greens_r', n_colors=10)

In [None]:
# Preview the stored palette as a row of colour swatches
sns.palplot(pal=palette)

In [None]:
# Scatter plot of max_order (x) against average_price (y)
scatterplot_max_order_to_average_price = sns.scatterplot(
    data=df_ords_prods_all_merged,
    x='max_order',
    y='average_price',
    color='green',
)

In [None]:
# Bar chart of how often each income-group value occurs
fig = plt.figure(figsize=(10, 5))
bar_price_range = (
    df_ords_prods_all_merged['income-group']
    .value_counts()
    .plot(kind='bar', color=['green'])
)
# Label the axes object returned by the plot call
bar_price_range.set_xlabel('Income Group')
bar_price_range.set_ylabel('Order Frequency')
bar_price_range.set_title('Order Frequency by Income Group')
bar_price_range.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

In [None]:
# Box plot of income (x) for each family_status (y)
ax = sns.boxplot(
    data=df_ords_prods_all_merged,
    x='income',
    y='family_status',
    color='green',
)
# Rotate the x tick labels by 90 degrees.
# tick_params only changes the rotation; re-assigning the labels with
# set_xticklabels(get_xticklabels()) would freeze the label texts on an
# automatically located axis and can trigger a matplotlib warning.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Stacked bar chart of df_grouped, one colour per stacked column
colors = ['green', 'orange']
bar_spender_region = df_grouped.plot.bar(stacked=True, color=colors)
bar_spender_region.set_title('Customer Spending Power in Different Regions')
bar_spender_region.set_xlabel('Region')
bar_spender_region.set_ylabel('Number of Customers')
# Keep the x tick labels horizontal
plt.xticks(rotation=0)
plt.show()

In [None]:
# Orders per day of week within each loyalty_flag group.
# The colour list repeats each colour 7 times (21 entries in total).
bar_dow_loyalty = (
    ords_prods_customers
    .groupby('loyalty_flag')['orders_day_of_week']
    .value_counts()
    .plot.bar(
        title='Day of Orders - Loyalty Status',
        xlabel='Day of week',
        ylabel='Number of Orders',
        color=['green'] * 7 + ['orange'] * 7 + ['blue'] * 7,
    )
)
plt.tight_layout()

In [None]:
# Stacked bar chart of df_grouped_dep_loyal_sort, one colour per stacked column
colors = ['green', 'orange', 'blue']
bar_dep_loyalty = df_grouped_dep_loyal_sort.plot.bar(stacked=True, color=colors)
bar_dep_loyalty.set_title('Department Popularity among Loyalty Status')
bar_dep_loyalty.set_xlabel('Department')
bar_dep_loyalty.set_ylabel('Number of Orders')
plt.tight_layout()
plt.show()

In [None]:
# Orders per day of week within each region group.
# The colour list repeats each colour 7 times (28 entries in total).
bar_dow_region = (
    ords_prods_customers
    .groupby('region')['orders_day_of_week']
    .value_counts()
    .plot.bar(
        title='Day of Orders - Region',
        xlabel='Day of week',
        ylabel='Number of Orders',
        color=['green'] * 7 + ['orange'] * 7 + ['blue'] * 7 + ['purple'] * 7,
    )
)
plt.tight_layout()

In [None]:
# Stacked bar chart of df_grouped_dep_region_sort, one colour per stacked column
colors = ['green', 'orange', 'blue', 'purple']
bar_dep_region = df_grouped_dep_region_sort.plot.bar(stacked=True, color=colors)
bar_dep_region.set_title('Department Popularity among Region')
bar_dep_region.set_xlabel('Department')
bar_dep_region.set_ylabel('Number of Orders')
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of how often each fam_status value occurs
pie_fam_status = (
    ords_prods_customers['fam_status']
    .value_counts()
    .plot.pie(
        title='Family Status',
        colors=['green', 'orange', 'blue', 'purple'],
    )
)

In [None]:
# Horizontal stacked bar chart of df_grouped_fam_age
colors = ['green', 'orange', 'blue']
bar_fam_age = df_grouped_fam_age.plot.barh(stacked=True, color=colors)
bar_fam_age.set_title('Orders in Different Family Status and Age Groups')
bar_fam_age.set_xlabel('Number of Customers')
bar_fam_age.set_ylabel('Family Status')
plt.tight_layout()
plt.show()

In [None]:
# Show proportions instead of raw counts in a stacked bar chart
colors = ['green', 'orange', 'blue']
# Divide every row by its own row total so each row sums to 1
df_prop_grouped_loyal_age = df_grouped_loyal_age.div(
    df_grouped_loyal_age.sum(axis='columns'),
    axis='index',
)
bar_age_loyalty_prop = df_prop_grouped_loyal_age.plot.bar(stacked=True, color=colors)
bar_age_loyalty_prop.set_title('Proportion of Orders in Different Age Groups and Loyalty')
bar_age_loyalty_prop.set_xlabel('Age Groups')
bar_age_loyalty_prop.set_ylabel('Proportion of Customers')
# Keep the x tick labels horizontal
plt.xticks(rotation=0)
plt.show()

In [None]:
# Count plot: number of rows per age_flag value

plt.figure(figsize=(10, 6))
barplot1 = sns.countplot(x='age_flag', data=df_2)

barplot1.set_title('Distribution of Ages Groups')
barplot1.set_xlabel('Age Groups')
barplot1.set_ylabel('Count')
# Tilt the x tick labels and right-align them
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Count plot: number of rows per department value

plt.figure(figsize=(10, 6))
barplot4 = sns.countplot(x='department', data=df_2)

barplot4.set_title('Most Frequently Purchased Items')
barplot4.set_xlabel('Department')
barplot4.set_ylabel('Count')
# plt.legend(title = 'Order Type Groups', bbox_to_anchor = (1.05, 1), loc = 'upper left')
# Tilt the x tick labels and right-align them
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Count plot: rows per region, split by age_flag

plt.figure(figsize=(10, 6))
barplot6 = sns.countplot(x='region', hue='age_flag', data=df_2)

barplot6.set_title('Age Group Distribution by Region')
barplot6.set_xlabel('Region')
barplot6.set_ylabel('Count')
# Place the legend just outside the right edge of the axes
barplot6.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Line chart of prices (y) over order_dow (x), one line per age_flag.
# Per the original note, day '0' is Saturday.
plt.figure(figsize=(10, 5))

barplot7 = sns.lineplot(
    data=df_2,
    x='order_dow',
    y='prices',
    hue='age_flag',
    palette='RdYlGn',
    errorbar=None,  # no error band is drawn
)

barplot7.set_title("Orders Day of Week by Age Group")
barplot7.set_xlabel("Days of the Week")
barplot7.set_ylabel("")
plt.show()

In [None]:
# Count plot: rows per region, split by order_type

plt.figure(figsize=(10, 6))
barplot8 = sns.countplot(x='region', hue='order_type', data=df_2)

barplot8.set_title('Order Type by Region')
barplot8.set_xlabel('Region')
barplot8.set_ylabel('Count')
# Place the legend just outside the right edge of the axes
barplot8.legend(title='Order Type Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()