### Masks, Comparisons and Boolean Operators in Numpy

In [1]:
import numpy as np

# Sample patient weights used in the masking examples that follow
weights = np.array([52, 43, 61, 30, 72, 51, 28, 64])

# How do we filter this array of weights to give us the 'obese' patients?
# Answer: build a Boolean mask and index the array with it.

weights

array([52, 43, 61, 30, 72, 51, 28, 64])

In [2]:
# Comparing an array with a scalar is done element by element and produces
# a mask: an array of Booleans, one per element of `weights`
obese_mask = weights > 60
obese_mask

array([False, False,  True, False,  True, False, False,  True])

In [3]:
# Applying the mask: indexing with a Boolean array keeps only the
# elements whose mask value is True

obese_patients = weights[obese_mask]
obese_patients

array([61, 72, 64])

In [4]:
# Moderate weight mask: combine two conditions with the element-wise AND (&).
# Each comparison needs its own parentheses because & binds more tightly
# than >= and <=.

moderate_weight_mask = (weights >= 40) & (weights <= 60)
moderate_weight_mask

array([ True,  True, False, False, False,  True, False, False])

In [5]:
# Inverting the obese mask: ~ flips every Boolean, so True marks the
# patients whose weight is NOT above 60
not_obese_mask = ~obese_mask
not_obese_mask

array([ True,  True, False,  True, False,  True,  True, False])

# Fundamentals of Pandas

1. Data structures in pandas
2. File formats and importing data into pandas
3. Basic inspection of a DataFrame
4. Indexing/slicing of a DataFrame

#### Data structures 
1. Series - a one-dimensional labelled array (a single column of data)
2. DataFrame - a two-dimensional labelled table of rows and columns (can be considered as a 2D array)

In [6]:
# Supported sources shown here: CSV, TSV, Excel files and data from URLs.
# The reader calls below are templates only (kept commented out);
# replace the placeholder file names / URL with real ones before running.

import pandas as pd

# For Excel files
# df = pd.read_excel('filename.xlsx')

# For CSV files
# df = pd.read_csv('filename.csv')

# For data from a URL (read_csv accepts a URL in place of a file path)
# url = "url"
# df = pd.read_csv(url)

# For tab-separated value files (same reader, tab as the separator)
# df = pd.read_csv('file.tsv', sep='\t')

#### Basic inspection of a dataframe

In [7]:
# Load the sales data, then preview it to check that the import succeeded.
# NOTE: the path below is absolute and specific to one machine; point it at
# wherever Sales.csv lives on yours.
sales_csv_path = r"C:\Users\RMwaura\Downloads\Sales.csv"
sales_df = pd.read_csv(sales_csv_path)
sales_df.head(n=5)  # first five rows: a quick check that the data was read correctly

Unnamed: 0,Brands,Models,Colors,Memory,Storage,Camera,Rating,Selling Price,Original Price,Mobile,Discount,discount percentage
0,SAMSUNG,GALAXY M31S,Mirage Black,8 GB,128 GB,Yes,4.3,19330,20999,SAMSUNG GALAXY M31S,1669,7.947998
1,Nokia,3.2,Steel,2 GB,16 GB,Yes,3.8,10199,10199,Nokia 3.2,0,0.0
2,realme,C2,Diamond Black,2 GB,,Yes,4.4,6999,7999,realme C2,1000,12.501563
3,Infinix,Note 5,Ice Blue,4 GB,64 GB,Yes,4.2,12999,12999,Infinix Note 5,0,0.0
4,Apple,iPhone 11,Black,4GB,64 GB,Yes,4.6,49900,49900,Apple iPhone 11,0,0.0


In [8]:
# Check the shape: an attribute (no parentheses) giving (rows, columns)
sales_df.shape

(3114, 12)

In [9]:
# Preview the end of the data: tail() shows the last five rows by default
sales_df.tail()

Unnamed: 0,Brands,Models,Colors,Memory,Storage,Camera,Rating,Selling Price,Original Price,Mobile,Discount,discount percentage
3109,POCO,M4 Pro 5G,Cool Blue,6 GB,128 GB,Yes,4.4,16999,19999,POCO M4 Pro 5G,3000,15.00075
3110,Nokia,225,Black,,Expandable Upto 32 GB,Yes,3.6,3499,3499,Nokia 225,0,0.0
3111,Apple,iPhone SE,White,2 GB,128 GB,Yes,4.5,44900,44900,Apple iPhone SE,0,0.0
3112,Apple,iPhone 13 Pro,Gold,6 GB,128 GB,Yes,,119900,119900,Apple iPhone 13 Pro,0,0.0
3113,GIONEE,F9,Blue,3 GB,32 GB,Yes,4.2,7900,7900,GIONEE F9,0,0.0


In [10]:
# info() lists each column's non-null count and dtype; a non-null count
# lower than the number of entries means that column has missing values
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3114 entries, 0 to 3113
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Brands               3114 non-null   object 
 1   Models               3114 non-null   object 
 2   Colors               3114 non-null   object 
 3   Memory               3071 non-null   object 
 4   Storage              3075 non-null   object 
 5   Camera               3114 non-null   object 
 6   Rating               2970 non-null   float64
 7   Selling Price        3114 non-null   int64  
 8   Original Price       3114 non-null   int64  
 9   Mobile               3114 non-null   object 
 10  Discount             3114 non-null   int64  
 11  discount percentage  3114 non-null   float64
dtypes: float64(2), int64(3), object(7)
memory usage: 292.1+ KB


In [11]:
# Gives summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
# NOTE(review): the minimum Discount / discount percentage come out negative
# in the output - worth checking those rows before any analysis.
sales_df.describe()

Unnamed: 0,Rating,Selling Price,Original Price,Discount,discount percentage
count,2970.0,3114.0,3114.0,3114.0,3114.0
mean,4.243098,26436.625562,28333.473025,1896.847463,6.086788
std,0.271991,30066.892622,31525.599889,5337.126176,11.106776
min,2.3,1000.0,1000.0,-8000.0,-160.320641
25%,4.1,9990.0,10030.25,0.0,0.0
50%,4.3,15000.0,16889.5,0.0,0.0
75%,4.4,28999.0,31500.0,2000.0,9.836388
max,5.0,179900.0,189999.0,75000.0,70.610305


In [12]:
# Gives the column labels of the DataFrame (as an Index object).
# Comes in handy when you need to refer to the columns by name.
sales_df.columns

Index(['Brands', 'Models', 'Colors', 'Memory', 'Storage', 'Camera', 'Rating',
       'Selling Price', 'Original Price', 'Mobile', 'Discount',
       'discount percentage'],
      dtype='object')

#### Accessing data using iloc and loc

In [13]:
# Plain slicing on the DataFrame is another way to pick rows, e.g.:
# sales_df[0:2]

# loc  - label-based: accesses rows/columns by their index label and column name
# iloc - position-based: accesses rows/columns by their integer position (0-based)

sales_df.iloc[0]

Brands                              SAMSUNG
Models                         GALAXY M31S 
Colors                         Mirage Black
Memory                                 8 GB
Storage                              128 GB
Camera                                  Yes
Rating                                  4.3
Selling Price                         19330
Original Price                        20999
Mobile                 SAMSUNG GALAXY M31S 
Discount                               1669
discount percentage                7.947998
Name: 0, dtype: object

In [14]:
# To access multiple rows, slice by position
# Slice syntax: [start:stop:step] - separated by colons, not commas

sales_df.iloc[1000::100] # from position 1000 to the end, taking every 100th row

Unnamed: 0,Brands,Models,Colors,Memory,Storage,Camera,Rating,Selling Price,Original Price,Mobile,Discount,discount percentage
1000,Lenovo,Vibe K5 Plus,Silver,2 GB,16 GB,Yes,4.0,8499,8499,Lenovo Vibe K5 Plus,0,0.0
1100,ASUS,ROG Phone II,Black,8 GB,128 GB,Yes,4.6,40999,40999,ASUS ROG Phone II,0,0.0
1200,vivo,Y83,Gold,4 GB,32 GB,Yes,4.4,15990,15990,vivo Y83,0,0.0
1300,POCO,M4 Pro,Power Black,6 GB,64 GB,Yes,4.4,14999,17999,POCO M4 Pro,3000,16.667593
1400,OPPO,A55,Starry Black,6 GB,128 GB,Yes,3.8,17490,20990,OPPO A55,3500,16.674607
1500,Google Pixel,3a,Just Black,4 GB,64 GB,Yes,4.5,39999,39999,Google Pixel 3a,0,0.0
1600,Apple,iPhone XS,Space Grey,4 GB,256 GB,Yes,4.7,76999,103900,Apple iPhone XS,26901,25.891242
1700,vivo,X70 Pro,Aurora Dawn,8 GB,128 GB,Yes,4.5,46990,51990,vivo X70 Pro,5000,9.617234
1800,vivo,S1,Skyline Blue,6 GB,64 GB,Yes,4.4,20990,20990,vivo S1,0,0.0
1900,SAMSUNG,Galaxy A20s,Black,4 GB,64 GB,Yes,4.3,14900,14900,SAMSUNG Galaxy A20s,0,0.0


In [15]:
# First five rows, limited to the columns at positions 0 and 2
brand_color_positions = [0, 2]
sales_df.iloc[:5, brand_color_positions]

Unnamed: 0,Brands,Colors
0,SAMSUNG,Mirage Black
1,Nokia,Steel
2,realme,Diamond Black
3,Infinix,Ice Blue
4,Apple,Black


In [16]:
# Accessing one row with iloc: a single integer position returns that row
# as a Series whose index is the column names
sales_df.iloc[0]

Brands                              SAMSUNG
Models                         GALAXY M31S 
Colors                         Mirage Black
Memory                                 8 GB
Storage                              128 GB
Camera                                  Yes
Rating                                  4.3
Selling Price                         19330
Original Price                        20999
Mobile                 SAMSUNG GALAXY M31S 
Discount                               1669
discount percentage                7.947998
Name: 0, dtype: object

In [17]:
# Accessing multiple rows that are not next to each other:
# pass a list of row positions (note the double brackets)
sales_df.iloc[[0,6,10]]

Unnamed: 0,Brands,Models,Colors,Memory,Storage,Camera,Rating,Selling Price,Original Price,Mobile,Discount,discount percentage
0,SAMSUNG,GALAXY M31S,Mirage Black,8 GB,128 GB,Yes,4.3,19330,20999,SAMSUNG GALAXY M31S,1669,7.947998
6,Apple,iPhone 13 Mini,Pink,6 GB,512 GB,Yes,,99900,99900,Apple iPhone 13 Mini,0,0.0
10,SAMSUNG,Galaxy A12,Black,4 GB,64 GB,Yes,4.2,11989,11989,SAMSUNG Galaxy A12,0,0.0


In [18]:
# Accessing one column by position: ':' keeps every row, 3 picks the
# column at position 3 (Memory)
sales_df.iloc[:, 3]

0       8 GB
1       2 GB
2       2 GB
3       4 GB
4        4GB
        ... 
3109    6 GB
3110     NaN
3111    2 GB
3112    6 GB
3113    3 GB
Name: Memory, Length: 3114, dtype: object

In [19]:
# Accessing multiple columns: keep every row (:) and pass a list of
# column positions
brand_memory_price_positions = [0, 3, 7]
sales_df.iloc[:, brand_memory_price_positions]

Unnamed: 0,Brands,Memory,Selling Price
0,SAMSUNG,8 GB,19330
1,Nokia,2 GB,10199
2,realme,2 GB,6999
3,Infinix,4 GB,12999
4,Apple,4GB,49900
...,...,...,...
3109,POCO,6 GB,16999
3110,Nokia,,3499
3111,Apple,2 GB,44900
3112,Apple,6 GB,119900


In [20]:
# Output the first 10 rows (Brand, Selling Price and Original Price)
brand_and_price_positions = [0, 7, 8]
sales_df.iloc[:10, brand_and_price_positions]

Unnamed: 0,Brands,Selling Price,Original Price
0,SAMSUNG,19330,20999
1,Nokia,10199,10199
2,realme,6999,7999
3,Infinix,12999,12999
4,Apple,49900,49900
5,GIONEE,2199,2199
6,Apple,99900,99900
7,Apple,42999,47900
8,SAMSUNG,20400,20400
9,Xiaomi,21736,22999


In [21]:
# Model details and specs: all rows, columns at positions 1 to 5
# (the stop position, 6, is not included)
sales_df.iloc[:, 1:6]

Unnamed: 0,Models,Colors,Memory,Storage,Camera
0,GALAXY M31S,Mirage Black,8 GB,128 GB,Yes
1,3.2,Steel,2 GB,16 GB,Yes
2,C2,Diamond Black,2 GB,,Yes
3,Note 5,Ice Blue,4 GB,64 GB,Yes
4,iPhone 11,Black,4GB,64 GB,Yes
...,...,...,...,...,...
3109,M4 Pro 5G,Cool Blue,6 GB,128 GB,Yes
3110,225,Black,,Expandable Upto 32 GB,Yes
3111,iPhone SE,White,2 GB,128 GB,Yes
3112,iPhone 13 Pro,Gold,6 GB,128 GB,Yes


#### Loc

In [22]:
# Accessing one row with loc (by its index label):
# sales_df.loc[0]

# Accessing a single value: row label 0, column "Brands"
sales_df.loc[0, "Brands"]

'SAMSUNG'

In [23]:
# Small example DataFrame with string index labels, used for the loc demos
row_labels = ['A', 'B', 'C', 'D']
data = dict(
    Name=['Stanley', 'Ruth', 'Berlin', 'Esther'],
    Graded=[20, 30, 40, 50],
    Subject=['English', 'Math', 'Geography', 'Computer Science'],
)
df = pd.DataFrame(data, index=row_labels)
df

Unnamed: 0,Name,Graded,Subject
A,Stanley,20,English
B,Ruth,30,Math
C,Berlin,40,Geography
D,Esther,50,Computer Science


In [24]:
# Using loc on indexes that are not numerical
# If accessing multiple rows, pass a list of the row labels you want;
# the rows come back in the order the labels are listed
df.loc[['C', 'A']]

Unnamed: 0,Name,Graded,Subject
C,Berlin,40,Geography
A,Stanley,20,English


In [25]:
# Accessing one column using loc: ':' keeps every row, the column is
# chosen by name (pass a list of names to get several columns)

df.loc[:, 'Graded']

A    20
B    30
C    40
D    50
Name: Graded, dtype: int64

In [26]:
# loc takes a list of row labels and a list of column names together;
# the result follows the order given in each list
selected_rows = ['C', 'D']
selected_columns = ['Graded', 'Name']
df.loc[selected_rows, selected_columns]

Unnamed: 0,Graded,Name
C,40,Berlin
D,50,Esther


In [27]:
# Grades and subject of Ruth and Esther (row labels 'B' and 'D')
ruth_and_esther = ['B', 'D']
grade_and_subject = ['Graded', 'Subject']
df.loc[ruth_and_esther, grade_and_subject]

Unnamed: 0,Graded,Subject
B,30,Math
D,50,Computer Science
