# Week 3 Data Analytics
# Pandas DataFrames

# Exploration and manipulation

In [1]:
# import libraries

In [3]:
import pandas as pd
import numpy as np


# Create a pandas series

In [4]:
# Build a Series from a plain Python list; np.nan marks the missing entry,
# and the resulting Series is stored with dtype float64.
pandas_series_data = pd.Series(data=[1, 2, 3, 4, np.nan, 5, 6])


In [5]:
# print() emits the plain-text form of the Series: index, values, then dtype
print(pandas_series_data)

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    5.0
6    6.0
dtype: float64


# Create a pandas DataFrame

In [6]:
# Column-oriented input: every key becomes a column name and every list
# supplies that column's values, one entry per row.
data = dict(
    Product=['Laptop', 'Mouse', 'Keyboard'],
    Price=[1200, 25, 75],
    InStock=[True, True, False],
)
pandas_df_data = pd.DataFrame(data=data)

In [7]:
# inspect the raw dictionary that the DataFrame was built from
data

{'Product': ['Laptop', 'Mouse', 'Keyboard'],
 'Price': [1200, 25, 75],
 'InStock': [True, True, False]}

In [6]:
# print() shows the DataFrame as aligned plain text
print(pandas_df_data)

    Product  Price  InStock
0    Laptop   1200     True
1     Mouse     25     True
2  Keyboard     75    False


In [8]:
# a bare expression on the last line of a cell displays the DataFrame as the cell output
pandas_df_data

Unnamed: 0,Product,Price,InStock
0,Laptop,1200,True
1,Mouse,25,True
2,Keyboard,75,False


# Importing Data 

In [9]:
# Load the sales dataset into a DataFrame. The path is relative, so
# sales_data.csv must be in the notebook's working directory.
sales_data_df = pd.read_csv('sales_data.csv')

In [10]:
# show the first 5 rows (head() with no argument returns 5 rows)
sales_data_df.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Paper,634.86,1,-178.47
1,Furniture,Storage,59.34,2,190.48
2,Technology,Paper,904.87,7,-0.89
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8


In [12]:
# show the first 8 rows
sales_data_df.head(8)

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Paper,634.86,1,-178.47
1,Furniture,Storage,59.34,2,190.48
2,Technology,Paper,904.87,7,-0.89
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8
5,Furniture,Chairs,16.96,3,129.03
6,Technology,Phones,66.71,6,-195.97
7,Office Supplies,Phones,232.97,9,271.59


# Exploring Methods

In [11]:
# summary of the dataset: row count, column names, non-null counts, dtypes and memory usage
sales_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      994 non-null    object 
 1   Sub-Category  994 non-null    object 
 2   Sales         994 non-null    float64
 3   Quantity      994 non-null    int64  
 4   Profit        994 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 39.0+ KB


In [13]:
# descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
sales_data_df.describe()

Unnamed: 0,Sales,Quantity,Profit
count,994.0,994.0,994.0
mean,507.952636,4.997988,45.519577
std,289.783969,2.592109,145.267644
min,10.19,1.0,-199.91
25%,253.6225,3.0,-80.8425
50%,518.655,5.0,42.51
75%,762.9425,7.0,172.7375
max,999.72,9.0,298.81


# Selecting and filtering data

In [17]:
# label based filter: rows labelled 0 through 4 of the 'Sales' column
# (.loc slices include the end label, so 0:4 returns 5 rows)
sales_data_df.loc[0:4, 'Sales']

0    634.86
1     59.34
2    904.87
3    910.30
4    782.41
Name: Sales, dtype: float64

In [18]:
# show the first 2 rows (handy for reading off the column positions used with .iloc)
sales_data_df.head(2)

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Paper,634.86,1,-178.47
1,Furniture,Storage,59.34,2,190.48


In [22]:
# position based filter: row positions 0-4 of the column at position 2 ('Sales')
# (.iloc slices exclude the stop position, so 0:5 returns 5 rows)
sales_data_df.iloc[0:5, 2]

0    634.86
1     59.34
2    904.87
3    910.30
4    782.41
Name: Sales, dtype: float64

In [23]:
# show the first 5 rows of the full DataFrame
sales_data_df.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Paper,634.86,1,-178.47
1,Furniture,Storage,59.34,2,190.48
2,Technology,Paper,904.87,7,-0.89
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8


# Conditional filtering 


In [24]:
# filter rows where Sales are greater than 500
# The comparison yields a boolean mask; .loc keeps the rows where it is True.
high_sales = sales_data_df.loc[sales_data_df['Sales'] > 500]
high_sales.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Paper,634.86,1,-178.47
2,Technology,Paper,904.87,7,-0.89
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8
10,Technology,Bookcases,923.4,8,184.06


In [25]:
# filter rows with multiple conditions
# Both conditions are applied to the full dataset, so this cell does not rely
# on the pre-filtered high_sales frame. Each comparison is wrapped in
# parentheses and the two masks are combined with & (element-wise AND).
profitable_high_sales = sales_data_df[
    (sales_data_df['Sales'] > 500) & (sales_data_df['Profit'] > 0)
]
profitable_high_sales.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8
10,Technology,Bookcases,923.4,8,184.06
18,Office Supplies,Bookcases,789.26,3,242.18
19,Office Supplies,Paper,975.09,9,79.7


# Sorting DataFrames

In [26]:
# sort by a single column: highest Sales first
# (operates on high_sales, i.e. only the rows with Sales > 500)
df_sorted = high_sales.sort_values(by='Sales', ascending=False)
df_sorted.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
471,Office Supplies,Paper,999.72,5,-58.32
941,Office Supplies,Bookcases,999.55,1,131.68
659,Office Supplies,Chairs,999.47,4,-67.0
143,Office Supplies,Tables,997.11,6,52.31
520,Furniture,Bookcases,996.16,9,-115.06


In [27]:
# sort by multiple columns
# Category ascending, then Profit descending within each Category
# (operates on high_sales, i.e. only the rows with Sales > 500)

df_multi_sorted = high_sales.sort_values(by=['Category','Profit'], ascending=[True, False])
df_multi_sorted.head()



Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
795,Furniture,Storage,898.7,4,297.74
492,Furniture,Phones,748.4,9,297.29
231,Furniture,Binders,631.96,5,297.21
422,Furniture,Paper,843.31,3,293.87
772,Furniture,Paper,543.42,8,290.62


# Adding calculated columns

In [28]:
# Profit margin as a percentage: Profit divided by Sales, times 100.
# This adds a new 'Profit_Margin' column to sales_data_df itself.
sales_data_df['Profit_Margin'] = (sales_data_df['Profit'] / sales_data_df['Sales']) * 100


In [29]:
# show the new column next to the two columns it was computed from
sales_data_df[['Sales', 'Profit', 'Profit_Margin']].head()

Unnamed: 0,Sales,Profit,Profit_Margin
0,634.86,-178.47,-28.11171
1,59.34,190.48,320.997641
2,904.87,-0.89,-0.098357
3,910.3,21.95,2.411293
4,782.41,246.8,31.543564
