# Week 3 Data Analytics
# Pandas DataFrames

# Exploration and manipulation

In [1]:
# import libraries

In [1]:
import pandas as pd
import numpy as np


# Create a pandas series

In [2]:
# A Series is a one-dimensional labelled array. The np.nan entry marks a
# missing value among the integers.
pandas_series_data = pd.Series(data=[1, 2, 3, 4, np.nan, 5, 6])


In [3]:
# Show the Series as plain text: index labels on the left, values on the
# right, and the dtype on the last line.
print(pandas_series_data)

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    5.0
6    6.0
dtype: float64


# Create a pandas DataFrame

In [4]:
# Column-oriented input: every key becomes a column name and every list
# supplies that column's values (one entry per row).
data = dict(
    Product=['Laptop', 'Mouse', 'Keyboard'],
    Price=[1200, 25, 75],
    InStock=[True, True, False],
)
pandas_df_data = pd.DataFrame(data=data)

In [5]:
# Inspect the plain Python dictionary that the DataFrame was built from.
data

{'Product': ['Laptop', 'Mouse', 'Keyboard'],
 'Price': [1200, 25, 75],
 'InStock': [True, True, False]}

In [6]:
# print() renders the DataFrame as plain, column-aligned text.
print(pandas_df_data)

    Product  Price  InStock
0    Laptop   1200     True
1     Mouse     25     True
2  Keyboard     75    False


In [7]:
# A bare expression on the last line of a cell is displayed by the notebook
# itself (as a table) - no print() needed.
pandas_df_data

Unnamed: 0,Product,Price,InStock
0,Laptop,1200,True
1,Mouse,25,True
2,Keyboard,75,False


# Importing Data 

In [8]:
# Load the sales dataset into a DataFrame. The path is relative, so
# 'sales_data.csv' must be in the notebook's working directory.
sales_data_df = pd.read_csv('sales_data.csv')

In [9]:
# show the first 5 rows (head() defaults to n=5)
sales_data_df.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Printers,1199.19,1,-130.02
1,Technology,Accessories,604.32,1,-77.02
2,Technology,Laptops,695.77,6,142.39
3,Technology,Phones,1630.08,1,161.28
4,Office Supplies,Paper,1172.57,6,361.32


In [15]:
# Show the first 10 rows AND the last 10 rows.
# A notebook only renders the value of a cell's final expression, so a bare
# `sales_data_df.head(10)` on an earlier line would be computed and then
# silently discarded. display() (available as a built-in inside IPython /
# Jupyter kernels) renders it explicitly; tail(10) is still shown as the
# cell's last expression.
display(sales_data_df.head(10))
sales_data_df.tail(10)

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
90,Furniture,Chairs,406.17,3,37.62
91,Office Supplies,Paper,655.16,1,-83.75
92,Furniture,Chairs,1598.86,2,301.72
93,Office Supplies,Art,718.07,10,38.94
94,Office Supplies,Paper,1114.45,4,6.99
95,Technology,Phones,677.82,6,-109.12
96,Furniture,Furnishings,315.92,8,-52.26
97,Technology,Accessories,67.85,8,-11.75
98,Furniture,Bookcases,532.62,7,195.54
99,Furniture,Bookcases,1894.15,3,-231.09


# Exploring Methods

In [16]:
# summary of the dataset: index range, column names, non-null counts,
# dtypes and memory usage
sales_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      100 non-null    object 
 1   Sub-Category  100 non-null    object 
 2   Sales         100 non-null    float64
 3   Quantity      100 non-null    int64  
 4   Profit        100 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 4.0+ KB


In [17]:
# descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
sales_data_df.describe()

Unnamed: 0,Sales,Quantity,Profit
count,100.0,100.0,100.0
mean,1005.38,5.04,71.2812
std,563.24066,2.912599,206.729817
min,67.85,1.0,-343.72
25%,578.5175,2.0,-66.17
50%,922.28,5.0,21.59
75%,1532.73,7.0,172.58
max,1985.44,10.0,686.25


# Selecting and filtering data

In [18]:
# label-based selection: rows labelled 0 through 4 of the 'Sales' column
# (.loc slices include BOTH endpoints, so 0:4 returns five rows)
sales_data_df.loc[0:4, 'Sales']

0    1199.19
1     604.32
2     695.77
3    1630.08
4    1172.57
Name: Sales, dtype: float64

In [19]:
# show only the first two rows
sales_data_df.head(2)

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Printers,1199.19,1,-130.02
1,Technology,Accessories,604.32,1,-77.02


In [20]:
# position-based selection: rows at positions 0-4 of the column at
# position 2 ('Sales')
# (.iloc slices exclude the stop value, like ordinary Python slicing)
sales_data_df.iloc[0:5, 2]

0    1199.19
1     604.32
2     695.77
3    1630.08
4    1172.57
Name: Sales, dtype: float64

In [21]:
# first five rows with all columns, for reference
sales_data_df.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Printers,1199.19,1,-130.02
1,Technology,Accessories,604.32,1,-77.02
2,Technology,Laptops,695.77,6,142.39
3,Technology,Phones,1630.08,1,161.28
4,Office Supplies,Paper,1172.57,6,361.32


# Conditional filtering 


In [27]:
# Boolean-mask filtering: a comparison on a column yields a True/False
# Series, and only the rows flagged True are kept.
high_sales = sales_data_df.loc[sales_data_df['Sales'] > 500]            # rows with Sales above 500
quantity_higher_5 = sales_data_df.loc[sales_data_df['Quantity'] > 5]    # rows with Quantity above 5

# Preview both filtered frames, then the two columns the masks were built
# from, with a separator line between sections.
print(high_sales.head())
print('=======' * 10)
print(quantity_higher_5.head())
print('=======' * 10)
print(sales_data_df['Quantity'])
print('=======' * 10)
print(sales_data_df['Sales'])


          Category Sub-Category    Sales  Quantity  Profit
0       Technology     Printers  1199.19         1 -130.02
1       Technology  Accessories   604.32         1  -77.02
2       Technology      Laptops   695.77         6  142.39
3       Technology       Phones  1630.08         1  161.28
4  Office Supplies        Paper  1172.57         6  361.32
           Category Sub-Category    Sales  Quantity  Profit
2        Technology      Laptops   695.77         6  142.39
4   Office Supplies        Paper  1172.57         6  361.32
8   Office Supplies        Paper  1679.26         7   -3.79
10        Furniture    Bookcases  1920.16         9  494.82
12       Technology      Laptops   362.94        10  -68.63
0     1
1     1
2     6
3     1
4     6
     ..
95    6
96    8
97    8
98    7
99    3
Name: Quantity, Length: 100, dtype: int64
0     1199.19
1      604.32
2      695.77
3     1630.08
4     1172.57
       ...   
95     677.82
96     315.92
97      67.85
98     532.62
99    1894.15
Name: Sales, Length: 100, dtype: float64

In [25]:
# filter rows with multiple conditions
# Each comparison produces a boolean Series; `&` combines them element-wise.
# The parentheses are required because `&` binds more tightly than `>`.
# The filter is applied to the full frame rather than to `high_sales`:
# `high_sales` already satisfies Sales > 500, so re-testing it there was a
# no-op and made this cell depend on another cell having run first.
profitable_high_sales = sales_data_df[
    (sales_data_df['Sales'] > 500) & (sales_data_df['Profit'] > 0)
]
profitable_high_sales.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
3,Technology,Computers,910.3,4,21.95
4,Furniture,Accessories,782.41,8,246.8
10,Technology,Bookcases,923.4,8,184.06
18,Office Supplies,Bookcases,789.26,3,242.18
19,Office Supplies,Paper,975.09,9,79.7


# Sorting DataFrames

In [28]:
# Single-column sort: order the high-sales rows from the largest Sales
# value down to the smallest, then preview the top of the result.
df_sorted = high_sales.sort_values('Sales', ascending=False)
df_sorted.head()

Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
52,Technology,Phones,1985.44,3,136.09
22,Technology,Printers,1978.2,1,-233.97
34,Office Supplies,Storage,1952.91,10,289.07
10,Furniture,Bookcases,1920.16,9,494.82
61,Technology,Phones,1917.25,5,140.88


In [33]:
# sort by multiple columns
# Category descending (Z to A), then Quantity ascending within each category;
# `ascending` takes one flag per entry in `by`, in the same order.

df_multi_sorted = high_sales.sort_values(by=['Category','Quantity'], ascending=[False, True])
df_multi_sorted.head()



Unnamed: 0,Category,Sub-Category,Sales,Quantity,Profit
0,Technology,Printers,1199.19,1,-130.02
1,Technology,Accessories,604.32,1,-77.02
3,Technology,Phones,1630.08,1,161.28
22,Technology,Printers,1978.2,1,-233.97
37,Technology,Laptops,1745.91,1,579.45


# Adding calculated columns

In [34]:
# Profit margin as a percentage of sales: (Profit / Sales) * 100, computed
# element-wise and stored as a new column on the frame.
sales_data_df['Profit_Margin'] = sales_data_df['Profit'].div(sales_data_df['Sales']).mul(100)


In [35]:
# Compare the new margin column with the two columns it was derived from.
sales_data_df.loc[:, ['Sales', 'Profit', 'Profit_Margin']].head()

Unnamed: 0,Sales,Profit,Profit_Margin
0,1199.19,-130.02,-10.842319
1,604.32,-77.02,-12.744903
2,695.77,142.39,20.465096
3,1630.08,161.28,9.893993
4,1172.57,361.32,30.814365
