# Python Foundation: Class 7 topics

> 1. Type conversion functions:
    - Datetime
    - int/float
    - object
    
> 2. Plots/Graphs in python
    - Using pandas plot
    - Using pyplot from matplotlib
    - Using seaborn
    - --------------------------------------
    - plotnine/ggplot
    - Bokeh
    - plotly

In [None]:
import pandas as pd

In [None]:
# import the transaction data 
# NOTE(review): `q1` is not referenced again in the visible part of this notebook;
# the same file is loaded again as `pos_q1` in the "Type conversion in pandas" section.
q1 = pd.read_csv("D:/SampleData/POS_Q1.csv")

## Type conversion in pandas

In [None]:
# import the transaction data 
pos_q1 = pd.read_csv("D:/SampleData/POS_Q1.csv")

In [None]:
pos_q1.info()

In [None]:
# inspect the Date column before converting it to datetime
pos_q1.Date

In [None]:
# convert the Date column into datetime from object type
# format codes: %m = month, %d = day, %y = two-digit year, %H:%M = 24-hour hour:minute
pos_q1['Date'] = pd.to_datetime(pos_q1.Date, format = '%m-%d-%y %H:%M')

In [None]:
# sample date strings in non-standard layouts, to be parsed with pd.to_datetime
# d1: day and month run together, then a space and a four-digit year
d1 = pd.Series([
    '2309 2012',
    '0309 2019',
])
# d2: 12-hour clock time with an am/pm marker, then month, day and year run together
d2 = pd.Series([
    '12:45:55 pm 02232002',
    '11:40:55 pm 06022002',
    '12:04:55 am 11302002',
])

In [None]:
# %d%m = day then month with no separator, %Y = four-digit year
pd.to_datetime(d1, format = '%d%m %Y')

In [None]:
# %I = hour on a 12-hour clock, %p = am/pm marker,
# %m%d%Y = month, day and four-digit year with no separators
pd.to_datetime(d2, format = '%I:%M:%S %p %m%d%Y')

In [None]:
# convert the following non-standard numeric strings into numeric values
n1 = pd.Series(['2012.2', '0309'])

In [None]:
pd.to_numeric(n1)

In [None]:
# convert following values to text/object
# three float values; the last one is 99.3 (a comma here would create two separate elements, 99 and 3)
f1 = pd.Series([23.4, 65.7, 99.3])

In [None]:
f1.astype(str)

### Timestamp/DateOffset

In [None]:
# get the current system date and time
pd.Timestamp.now()

In [None]:
# add one day in current date
pd.Timestamp.now() + pd.DateOffset(days = 1)

### map / apply / applymap
> 1. map: works on each element of series
> 2. applymap: works on each element within a dataframe
> 3. apply: works on each column or row (a Series) of a dataframe, or on each element of a series, depending on how it is called

In [None]:
# convert Location variable from stores dataset into upper case (map)

# convert Store type and Location variables from stores dataset into upper case (applymap)

# convert all object variables from stores dataset into upper case (applymap)

# get the sum of all numerical variables from stores dataset

# create a UDF that takes series as an input and return stats summary of data for int variables (.describe function)

In [None]:
# convert Location variable from stores dataset into upper case (map)
# Series.map applies the function to each element and keeps the original index and name
stores.Location.map(lambda x: x.upper())

In [None]:
# convert Store type and Location variables from stores dataset into upper case
# applymap calls the lambda on every cell of the two selected columns
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map — confirm the target pandas version
stores[['Location', 'StoreType']].applymap(lambda x: x.upper())

In [None]:
# convert all object variables from stores dataset into upper case
# the boolean mask `stores.dtypes == 'object'` keeps only the object-dtype columns
# NOTE(review): x.upper() assumes every cell in those columns is a str; a missing value would raise — verify the data
stores.loc[:, stores.dtypes == 'object'].applymap(lambda x: x.upper())

In [None]:
# get the sum of all numerical variables from stores dataset
# select_dtypes(include='number') keeps every numeric column (integers of any width and floats),
# not only the int64 ones; DataFrame.sum() already sums column by column
stores.select_dtypes(include = 'number').sum()

In [None]:
# describe function
def fn_describe(x):
    """Return an extended statistical summary of a numeric Series.

    Parameters
    ----------
    x : pandas.Series
        Numeric data; missing values are allowed.

    Returns
    -------
    pandas.Series
        Values indexed by 'N' (non-missing count), 'NMISS' (missing count),
        'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN', the percentiles
        'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99', and 'MAX'.
    """
    # drop the missing values once instead of once per percentile
    non_missing = x.dropna()
    percentiles = [non_missing.quantile(q)
                   for q in (0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99)]
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(), x.std(), x.var(), x.min(),
                      *percentiles, x.max()],
                     index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
                            'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99', 'MAX'])

In [None]:
# run fn_describe on each int64 column; every column's summary becomes one column of the result
stores.loc[:, stores.dtypes == 'int64'].apply(fn_describe)

### data visualization in python - introduction

## Getting started with charts

In [None]:
import pandas as pd

In [None]:
# import matplotlib.pyplot as plt
from matplotlib import pyplot as plt

In [None]:
# import the sales  data 
stores = pd.read_csv("D:/SampleData/stores.csv")

In [None]:
stores.TotalSales.describe()

In [None]:
%matplotlib inline
#notebook/inline

## Plots/Charts in python
(composition / distribution / relation)

In [None]:
# import pandas
import pandas as pd

In [None]:
# import the stores data 
# NOTE(review): the file name is spelled "Stores.csv" here but "stores.csv" in the earlier
# "Getting started with charts" cell — confirm the real name if running on a case-sensitive filesystem
stores = pd.read_csv("D:/SampleData/Stores.csv")

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
import seaborn as sns

### Distribution Charts
> 1. Histogram
> 2. Scatter plot (one variable)
> 3. Box plot

#### distribution charts: histogram

In [None]:
# pandas
stores.TotalSales.hist(grid = False, bins = 10, edgecolor = '#000000', color = 'red')

In [None]:
# .plot function
# kind must be passed by keyword: Series.plot() raises a TypeError for positional arguments
stores.TotalSales.plot(kind = 'hist')

In [None]:
# pyplot
plt.hist(stores.TotalSales, bins = 10, edgecolor = 'black')

In [None]:
# seaborn
# NOTE(review): distplot is deprecated as of seaborn 0.11 (histplot/displot are the documented
# replacements) — confirm the target seaborn version before relying on it
sns.distplot(stores.TotalSales, bins = 10, vertical = False)

#### distribution charts: scatterplot (one variable)

In [None]:
# pandas dataframe: not to be used for one variable
stores.plot(x = 'TotalSales', y = 'TotalSales',  kind = 'scatter')

In [None]:
# pyplot
plt.scatter(x = stores.TotalSales.index, y = stores.TotalSales, c = 'y', s = 50)

# c = 'yellow'
# s = 50
# marker = + or . or * or ^, ,edgecolors="black"

In [None]:
# plt.grid(True)
# plt.xlabel("Index of TotalSales")
# plt.ylabel("TotalSales")
# plt.title("Distribution of TotalSales in the data")
# plt.show()

In [None]:
# seaborn
sns.scatterplot(x = stores.TotalSales.index, y = stores.TotalSales)

#### distribution charts: box plots

In [None]:
import numpy as np

In [None]:
# Getting random data
data = pd.Series(np.random.normal(loc = 110, scale = 20, size = 10000)) # loc: mean, scale = std/spread/width
data.head()

In [None]:
# using pandas
data.plot(kind = 'box')

In [None]:
# pyplot
plt.boxplot(data)
plt.show()

In [None]:
# seaborn
# pass the values as `y` so the box is drawn vertically regardless of how the
# installed seaborn version interprets a positional first argument
sns.boxplot(y = data)

### Relationship charts
> 1. scatter plot
> 2. bar/column charts
> 3. line charts
> 4. area charts

#### relationship charts: scatterplot (two variables)

In [None]:
# pandas
# OperatingCost on x and TotalSales on y, matching the pyplot and seaborn versions of this chart
stores.plot(x = 'OperatingCost', y = 'TotalSales', kind = 'scatter')

In [None]:
# pyplot
plt.scatter(x = stores.OperatingCost, y = stores.TotalSales)
plt.xlabel("Operating Cost")
plt.ylabel("Total Sales")
plt.title("Relation between Operating Cost and TotalSales")
plt.show()

In [None]:
# seaborn
sns.scatterplot(x = stores.OperatingCost, y = stores.TotalSales, hue = stores.Location)

In [None]:
# lmplot from seaborn
sns.lmplot(x = 'OperatingCost', y = 'TotalSales', data = stores)

#### relationship charts: column/bar/line/area chart

In [None]:
# total sales per location as a two-column frame (Location, TotalSales), used by the bar charts
summ = stores.groupby('Location', as_index = False)['TotalSales'].sum()

In [None]:
# using pyplot
plt.bar(x = 'Location', height = 'TotalSales', data = summ, color = 'red', edgecolor = 'black', width = .6)

In [None]:
# using seaborn
sns.barplot(x = summ.Location, y = summ.TotalSales)

### Composition Charts
> 1. pie charts

#### composition charts: pie

In [None]:
# get the sales for each location
summ = stores.groupby('Location').TotalSales.sum().reset_index()

In [None]:
# pandas
# a pie chart takes its wedge labels from the index (the `x` argument is not used for kind='pie'),
# so Location is moved into the index first
summ.set_index('Location').plot(y = 'TotalSales', kind = 'pie', legend = True, title = 'This is an example of pie chart')

In [None]:
# pyplot
plt.pie(x = 'TotalSales', labels = 'Location', data = summ, autopct='%.2f%%')
plt.title('This is an example of pie chart')
plt.show()

## Multivariate analysis

In [None]:
# Bivariate / Multivariate Bar graphs
ds_summary1 = stores.groupby(['Location', 'StoreType'])['TotalSales'].sum()
ds_summary2 = ds_summary1.reset_index()

In [None]:
# using pandas
ds_summary1.plot(kind = 'bar')

In [None]:
# column / bar chart
# reshape to one row per Location and one column per StoreType;
# keyword arguments are used because DataFrame.pivot is keyword-only from pandas 2.0
ds_summary3 = ds_summary2.pivot(index = 'Location', columns = 'StoreType', values = 'TotalSales')

In [None]:
ds_summary3

In [None]:
# clustered column / clustered bar chart
ds_summary3.plot(kind = 'bar', color = ['blue', 'violet', 'cyan'])

In [None]:
# stacked column / stacked bar chart
ds_summary3.plot(kind = 'bar', stacked = True)

In [None]:
ds_summary2

In [None]:
# clustered charts Seaborn
sns.barplot(x = 'Location', y = 'TotalSales', data = ds_summary2 , hue = 'StoreType')

# palette = ['blue', 'violet', 'cyan']
# order = ['Mumbai', 'Delhi', 'Chennai', 'Kolkata']