In [62]:
import numpy as np
import pandas as pd

In [61]:
# remove the warning
import warnings
warnings.filterwarnings('ignore')
# this is supposed to be used at the top of the page to not to see any warnings

## <font color='red'> <b>DataFrames</b><font color='black'>

- DataFrames are like matrix, 2D
- it shows your data as table format
- it has its default indexes, labled indexes, clumn names
- each column is a pandas series

## <font color='green'> <b>Creating a DataFrame</b><font color='black'>

- DataFrame is a two-dimensional collection of data.

- It is a data structure in which data is stored in tabular form (taboos).

- Data sets are organized in rows and columns; We can store multiple datasets in a dataframe.

- We can think of a DataFrame as a series of Series objects grouped together to share the same index.

- We can perform various arithmetic operations such as selecting column/row and adding column/row to the data frame.

- We can import DataFrames from external storage; SQL Database, CSV file and an Excel file.

### <font color='green'> <b>Creating a DataFrame Using the Lists of Data & Columns</b><font color='black'>

In [4]:
list1 = [[1,2,3], [4,5,6]]
col_name = ['yellow', 'brown', 'green']
df = pd.DataFrame(data = list1)
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [5]:
# I want to use my own column name
df = pd.DataFrame(data = list1, columns=col_name)
df

Unnamed: 0,yellow,brown,green
0,1,2,3
1,4,5,6


In [6]:
# assign the values form row/index name
col_name = ['yellow', 'brown']
df = pd.DataFrame(data = list1, index=col_name)
df


Unnamed: 0,0,1,2
yellow,1,2,3
brown,4,5,6


### <font color='green'> <b>Creating a DataFrame Using a Numpy Arrays</b><font color='black'>

In [7]:
ar1 = np.arange(1, 125, 5).reshape(5,5)
ar1
# 2D Matrix

array([[  1,   6,  11,  16,  21],
       [ 26,  31,  36,  41,  46],
       [ 51,  56,  61,  66,  71],
       [ 76,  81,  86,  91,  96],
       [101, 106, 111, 116, 121]])

In [8]:
ar1.ndim

2

In [9]:
pd.DataFrame(ar1)

Unnamed: 0,0,1,2,3,4
0,1,6,11,16,21
1,26,31,36,41,46
2,51,56,61,66,71
3,76,81,86,91,96
4,101,106,111,116,121


In [10]:
pd.DataFrame(data = ar1)
# specify the data, you dont need to specify
# it just makes it easire to read an understand

Unnamed: 0,0,1,2,3,4
0,1,6,11,16,21
1,26,31,36,41,46
2,51,56,61,66,71
3,76,81,86,91,96
4,101,106,111,116,121


In [11]:
pd.DataFrame(data = ar1, columns=['col1', 'col2', 'col3', 'col4', 'col5'])

Unnamed: 0,col1,col2,col3,col4,col5
0,1,6,11,16,21
1,26,31,36,41,46
2,51,56,61,66,71
3,76,81,86,91,96
4,101,106,111,116,121


In [12]:
pd.DataFrame(ar1, ['col1', 'col2', 'col3', 'col4', 'col5'])
# second position is row/index so it will asume that you typed the row value

Unnamed: 0,0,1,2,3,4
col1,1,6,11,16,21
col2,26,31,36,41,46
col3,51,56,61,66,71
col4,76,81,86,91,96
col5,101,106,111,116,121


In [13]:
pd.DataFrame(ar1, ['row1', 'row2', 'row3', 'row4', 'row5'], ['col1', 'col2', 'col3', 'col4', 'col5'])

Unnamed: 0,col1,col2,col3,col4,col5
row1,1,6,11,16,21
row2,26,31,36,41,46
row3,51,56,61,66,71
row4,76,81,86,91,96
row5,101,106,111,116,121


In [14]:
pd.DataFrame(ar1, columns=['col1', 'col2', 'col3', 'col4', 'col5'], index=['row1', 'row2', 'row3', 'row4', 'row5'])
# when we use the params names, its position does not matter

Unnamed: 0,col1,col2,col3,col4,col5
row1,1,6,11,16,21
row2,26,31,36,41,46
row3,51,56,61,66,71
row4,76,81,86,91,96
row5,101,106,111,116,121


### <font color='green'> <b>Creating a DataFrame Using a Dictionary</b><font color='black'>

In [21]:
data1 = {'name':['vusal', 'dila', 'royal'], 'age':[30, 31, 1]}
data1

{'name': ['vusal', 'dila', 'royal'], 'age': [30, 31, 1]}

In [22]:
pd.Series(data1)

name    [vusal, dila, royal]
age              [30, 31, 1]
dtype: object

In [24]:
pd.DataFrame(data=data1)
# left data is param, right data is python dict

Unnamed: 0,name,age
0,vusal,30
1,dila,31
2,royal,1


## <font color='green'> <b>Basic Attributes & Methods of DataFrames</b><font color='black'>

In [31]:
df_1 = pd.DataFrame(data1, columns=['name', 'age', 'salary'])
df_1
# salary column name does not exist
# since pandas cannot find the column name in dictionary, it will assign a null value

Unnamed: 0,name,age,salary
0,vusal,30,
1,dila,31,
2,royal,1,


In [29]:
pd.DataFrame(data1, columns=['job,' 'salary'])
# if you enter column names that do not exists, it will return empty table, without any values

Unnamed: 0,"job,salary"


In [33]:
df_1.shape

(3, 3)

In [35]:
df_1.size
# returns the number of values exists in def_1

9

## <font color='green'> <b>Indexing, Slicing & Selection</b><font color='black'>

In [36]:
data1 = {'name':['vusal', 'dila', 'royal'], 'age':[30, 31, 1]}
data1

{'name': ['vusal', 'dila', 'royal'], 'age': [30, 31, 1]}

In [37]:
df_2 = pd.DataFrame(data1)
df_2

Unnamed: 0,name,age
0,vusal,30
1,dila,31
2,royal,1


In [40]:
df_2.head(2)
# first 2 row

Unnamed: 0,name,age
0,vusal,30
1,dila,31


In [42]:
df_2.tail(2)
# last 2 row

Unnamed: 0,name,age
1,dila,31
2,royal,1


In [44]:
df_2.sample()
# returning sample 1 data

Unnamed: 0,name,age
0,vusal,30


In [45]:
df_2.sample(2)
# returning sample 2 data

Unnamed: 0,name,age
0,vusal,30
2,royal,1


In [47]:
df_2.columns

Index(['name', 'age'], dtype='object')

In [49]:
df_2.columns[1]

'age'

In [50]:
df_2.index

RangeIndex(start=0, stop=3, step=1)

In [51]:
df_2

Unnamed: 0,name,age
0,vusal,30
1,dila,31
2,royal,1


In [53]:
df_2.age.mean()
# returns arithmetic avarage -> mean

np.float64(20.666666666666668)

In [64]:
data_2 = {'name':['Jennifer', 'Jasmine', 'Kaye', 'Clark'], 'height':[130, 160, 180, 190], 'weight':[50, 68, 70, 75]}
data_2

{'name': ['Jennifer', 'Jasmine', 'Kaye', 'Clark'],
 'height': [130, 160, 180, 190],
 'weight': [50, 68, 70, 75]}

In [65]:
df_3 = pd.DataFrame(data_2)
df_3

Unnamed: 0,name,height,weight
0,Jennifer,130,50
1,Jasmine,160,68
2,Kaye,180,70
3,Clark,190,75


In [66]:
df_3 = pd.DataFrame(data_2, index=['student_01', 'student_02', 'student_03', 'student_04'])
df_3

Unnamed: 0,name,height,weight
student_01,Jennifer,130,50
student_02,Jasmine,160,68
student_03,Kaye,180,70
student_04,Clark,190,75


In [67]:
df_3.shape

(4, 3)

In [68]:
df_3.name

student_01    Jennifer
student_02     Jasmine
student_03        Kaye
student_04       Clark
Name: name, dtype: object

In [69]:
df_3['name']

student_01    Jennifer
student_02     Jasmine
student_03        Kaye
student_04       Clark
Name: name, dtype: object

In [71]:
df_3.columns
# see the column name

Index(['name', 'height', 'weight'], dtype='object')

In [72]:
df_3.height

student_01    130
student_02    160
student_03    180
student_04    190
Name: height, dtype: int64

In [83]:
srsx = df_3.weight
srsx

student_01    50
student_02    68
student_03    70
student_04    75
Name: weight, dtype: int64

In [84]:
type(srsx)

pandas.core.series.Series

In [86]:
df_3[['height', 'weight']]
# it is possible to get multiple collumns at the same time

Unnamed: 0,height,weight
student_01,130,50
student_02,160,68
student_03,180,70
student_04,190,75


In [87]:
df_3['student01']
# this format can be used when you read a column, not a row

KeyError: 'student01'

In [89]:
df_3['student_01':'student_03']
# this is slicing, in slicing when we use labled index, the ending index is also included

Unnamed: 0,name,height,weight
student_01,Jennifer,130,50
student_02,Jasmine,160,68
student_03,Kaye,180,70


In [90]:
df_3[0:2]
# when we use default index, dending is exluded

Unnamed: 0,name,height,weight
student_01,Jennifer,130,50
student_02,Jasmine,160,68


## <font color='green'> <b>Creating a New Column</b><font color='black'>

In [93]:
# BMI = kg/m^2
df_3['BMI'] = df_3.weight / (df_3.height / 100) **2
df_3
# we use

Unnamed: 0,name,height,weight,BMI
student_01,Jennifer,130,50,29.585799
student_02,Jasmine,160,68,26.5625
student_03,Kaye,180,70,21.604938
student_04,Clark,190,75,20.775623


In [95]:
# we can round the float number by using round function
df_3['BMI'] = round(df_3.weight / (df_3.height / 100) **2, 2)
df_3

Unnamed: 0,name,height,weight,BMI
student_01,Jennifer,130,50,29.59
student_02,Jasmine,160,68,26.56
student_03,Kaye,180,70,21.6
student_04,Clark,190,75,20.78


In [96]:
df_3['clm'] = np.arange(4)
df_3

Unnamed: 0,name,height,weight,BMI,clm
student_01,Jennifer,130,50,29.59,0
student_02,Jasmine,160,68,26.56,1
student_03,Kaye,180,70,21.6,2
student_04,Clark,190,75,20.78,3


In [97]:
# you need to have the file inside the same folder that you project is
df_csv = pd.read_csv('adult_eda.csv')
df_csv

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9.0,Never-married,Adm-clerical,,White,Male,0,0,20,United-States,<=50K


In [99]:
df_csv.head()
# get the first 5 row

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [100]:
df_csv.tail()
# last 5 data row

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32556,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9.0,Never-married,Adm-clerical,,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [101]:
df_csv.sample(6)
# returns sample 6 rows

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
23542,29,State-gov,199450,Some-college,10.0,Divorced,Adm-clerical,Unmarried,Black,Male,0,0,40,United-States,<=50K
30016,31,State-gov,440129,Some-college,10.0,Divorced,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,>50K
27612,37,Private,336880,Some-college,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,<=50K
23647,42,Federal-gov,70240,Some-college,10.0,Divorced,Exec-managerial,Unmarried,Asian-Pac-Islander,Female,0,0,40,United-States,<=50K
30323,21,State-gov,110946,HS-grad,9.0,Never-married,Adm-clerical,,White,Female,0,0,43,United-States,<=50K
29386,25,Private,178037,HS-grad,9.0,Never-married,Sales,Unmarried,White,Male,0,0,40,United-States,<=50K


In [102]:
df_csv.dtypes
# data types of  each series

age                 int64
workclass          object
fnlwgt              int64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
salary             object
dtype: object

In [104]:
df_csv.size
# returns the total values
# it is equals rows * columns

488415

In [106]:
df_csv.shape

(32561, 15)

In [107]:
df_csv.ndim
# check the dimension

2

In [109]:
df_csv.isnull()
# check if the data has null values
# False -> there is not a null value

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32557,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32558,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32559,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False


In [110]:
# find the total Null values on each column
df_csv.isnull().sum()

age                  0
workclass            0
fnlwgt               0
education            0
education-num      802
marital-status       0
occupation           0
relationship      5068
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country       0
salary               0
dtype: int64

In [111]:
df_csv.info()
# gives information abou the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  int64  
 1   workclass       32561 non-null  object 
 2   fnlwgt          32561 non-null  int64  
 3   education       32561 non-null  object 
 4   education-num   31759 non-null  float64
 5   marital-status  32561 non-null  object 
 6   occupation      32561 non-null  object 
 7   relationship    27493 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital-gain    32561 non-null  int64  
 11  capital-loss    32561 non-null  int64  
 12  hours-per-week  32561 non-null  int64  
 13  native-country  32561 non-null  object 
 14  salary          32561 non-null  object 
dtypes: float64(1), int64(5), object(9)
memory usage: 3.7+ MB


In [112]:
df_csv.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,31759.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.082843,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.576172,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [118]:
# change the column to row, row to column
df_csv.describe().transpose()
# you can type it as following
df_csv.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education-num,31759.0,10.082843,2.576172,1.0,9.0,10.0,12.0,16.0
capital-gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital-loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours-per-week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


## <font color='green'> <b>Removing Columns & Rows</b><font color='black'>

The drop() method is used to remove the specified row or column from a Pandas DataFrame.

If the column is to be removed, the axis='columns' parameter is used and the specified column is removed.

Similarly, if a row is to be removed, the axis='index' parameter is used and the specified row is removed.

This method is frequently used to remove unwanted data from DataFrame and perform data manipulation.

In [119]:
df_3

Unnamed: 0,name,height,weight,BMI,clm
student_01,Jennifer,130,50,29.59,0
student_02,Jasmine,160,68,26.56,1
student_03,Kaye,180,70,21.6,2
student_04,Clark,190,75,20.78,3


In [124]:
df_3.drop('clm')
# drop function has axis parameter, as default it is 0
# row name 'new' does not exist

KeyError: "['clm'] not found in axis"

In [121]:
df_3

Unnamed: 0,name,height,weight,BMI,clm
student_01,Jennifer,130,50,29.59,0
student_02,Jasmine,160,68,26.56,1
student_03,Kaye,180,70,21.6,2
student_04,Clark,190,75,20.78,3


In [122]:
# drop the 'clm'
df_3.drop('clm', axis=1)

Unnamed: 0,name,height,weight,BMI
student_01,Jennifer,130,50,29.59
student_02,Jasmine,160,68,26.56
student_03,Kaye,180,70,21.6
student_04,Clark,190,75,20.78


In [123]:
df_3
# when you try to dee=lete a data from your DataFrame, it does not delete without permission
# using inplace = True, we can make the change perminent

Unnamed: 0,name,height,weight,BMI,clm
student_01,Jennifer,130,50,29.59,0
student_02,Jasmine,160,68,26.56,1
student_03,Kaye,180,70,21.6,2
student_04,Clark,190,75,20.78,3


In [125]:
# delete the column perminenlt by adding inplace=True
df_3.drop('clm', axis=1, inplace=True)
df_3

Unnamed: 0,name,height,weight,BMI
student_01,Jennifer,130,50,29.59
student_02,Jasmine,160,68,26.56
student_03,Kaye,180,70,21.6
student_04,Clark,190,75,20.78


In [126]:
# delete the row perminenly
df_3.drop('student_04', inplace=True)
df_3

Unnamed: 0,name,height,weight,BMI
student_01,Jennifer,130,50,29.59
student_02,Jasmine,160,68,26.56
student_03,Kaye,180,70,21.6


In [127]:
# delete multiple columns
df_3.drop(['BMI', 'weight'], axis=1)

Unnamed: 0,name,height
student_01,Jennifer,130
student_02,Jasmine,160
student_03,Kaye,180


## <font color='green'> <b>Selecting Rows and Columns using .loc[ ] and iloc[ ]</b><font color='black'>

![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

loc:label-based

iloc: integer position-based

loc is short for "location" and allows accessing data by row and column names.

loc is used to access data at a specific location using row and column labels.

iloc is short for "integer location" and allows accessing data by row and column numbers.

iloc is used to access data at a specific location using row and column numbers.

1- Tag or index independence: While traditional indexing methods are affected by changes in indexes, "loc" and "iloc" methods make selections according to their locations. Therefore, changes in the indexes in the data set do not cause any problems in selections made with the "loc" and "iloc" methods.

2-Fast performance: The "iloc" function selects data using only row and column indexes, so it works faster than the loc function. This feature is useful in large data sets or performance-critical applications.

3-Flexibility: Traditional indexing methods only allow indexing using consecutive integers starting from zero. The "loc" and "iloc" methods are based on a tag or index values and provide a more flexible selection. The "loc" and "iloc" functions allow the combining of rows and columns in the data frame. For example, you can use both row and column labels or indexes to select data in a particular row in a particular column. This flexibility is useful in data processing operations.

4-More readable code: "loc" and "iloc" functions help make the code more readable.

In [4]:
np.random.seed(45)

data = np.random.randint(1, 100, 20).reshape(5, 4)
data

array([[76, 31,  4, 33],
       [96, 62, 86, 36],
       [69, 16, 66, 15],
       [54, 58, 73, 88],
       [47,  9, 54, 13]])

In [6]:
df_01 = pd.DataFrame(data, columns=['col1', 'col2', 'col3', 'col4'], index=range(101, 106))
df_01

Unnamed: 0,col1,col2,col3,col4
101,76,31,4,33
102,96,62,86,36
103,69,16,66,15
104,54,58,73,88
105,47,9,54,13


In [7]:
# select 1 row of data by label
df_01.loc[102]

col1    96
col2    62
col3    86
col4    36
Name: 102, dtype: int64

In [8]:
# select 1 row of data by original index
df_01.iloc[3]

col1    54
col2    58
col3    73
col4    88
Name: 104, dtype: int64

In [10]:
# iloc is used with default index
# you cannot use it by label
df_01.iloc[102]
# it will look at deault index and will not find 102 index

IndexError: single positional indexer is out-of-bounds

In [11]:
df_01

Unnamed: 0,col1,col2,col3,col4
101,76,31,4,33
102,96,62,86,36
103,69,16,66,15
104,54,58,73,88
105,47,9,54,13


In [13]:
df_01.loc[102:104]
# because of loc, 104 is included

Unnamed: 0,col1,col2,col3,col4
102,96,62,86,36
103,69,16,66,15
104,54,58,73,88


In [16]:
df_01.iloc[1:4]
# because of iloc, 4 is not included

Unnamed: 0,col1,col2,col3,col4
102,96,62,86,36
103,69,16,66,15
104,54,58,73,88


In [17]:
df_01.iloc[1:4:2]
# jumb by 2

Unnamed: 0,col1,col2,col3,col4
102,96,62,86,36
104,54,58,73,88


In [18]:
df_01.index

RangeIndex(start=101, stop=106, step=1)

In [20]:
'a b c d e'.split()
# using split function, we can split string by empty string inbetween them

['a', 'b', 'c', 'd', 'e']

In [22]:
df_01.index = 'a b c d e'.split()
df_01
# we can use the split function and assign the indexes

Unnamed: 0,col1,col2,col3,col4
a,76,31,4,33
b,96,62,86,36
c,69,16,66,15
d,54,58,73,88
e,47,9,54,13


In [23]:
df_01.index = ['A', 'B', 'C', 'D', 'E']
df_01
# we can change the index this way as well

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [24]:
df_01.loc['B':'D']
# in label index location, or loc, ths stop is included

Unnamed: 0,col1,col2,col3,col4
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88


In [25]:
df_01.iloc[1:4]
# in iloc technuqie, where we are slicing default indexes, the stop is not included

Unnamed: 0,col1,col2,col3,col4
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88


In [26]:
df_01.loc[::2]

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
C,69,16,66,15
E,47,9,54,13


In [27]:
df_01.loc[['A', 'D']]
# selec the multiple specific rows

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
D,54,58,73,88


- default syntax of loc and iloc is as: **df.lock[xx, yy]**
    - xx represents the index or range of index or list of index for **row**
    - yy represents the index or range of index or list of index for **columns**

In [29]:
df_01

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [30]:
df_01.loc[:,"col3"]
# first part before "," means select all
# second part after "," means select column

A     4
B    86
C    66
D    73
E    54
Name: col3, dtype: int64

In [32]:
df_01.iloc[:,2]

A     4
B    86
C    66
D    73
E    54
Name: col3, dtype: int64

In [38]:
# select 4, 66, and 54 from col3
df_01.iloc[::2,2]
df_01.loc[::2,'col3']

A     4
C    66
E    54
Name: col3, dtype: int64

In [44]:
# select 4 and 73 from col3
df_01.iloc[[0,3],2]

A     4
D    73
Name: col3, dtype: int64

In [45]:
df_01

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [47]:
# select all the data from col2 to col4
df_01.loc[:,'col2':]

Unnamed: 0,col2,col3,col4
A,31,4,33
B,62,86,36
C,16,66,15
D,58,73,88
E,9,54,13


In [52]:
# select 62, 36 and 58, 88
df_01.loc[['B','D'],['col2','col4']]

Unnamed: 0,col2,col4
B,62,36
D,58,88


In [51]:
# select 62, 36 and 9,13
df_01.loc['B'::3,'col2'::2]

Unnamed: 0,col2,col4
B,62,36
E,9,13


In [53]:
df_01

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [54]:
df_01.loc['A':'D','col2']

A    31
B    62
C    16
D    58
Name: col2, dtype: int64

In [55]:
df_01.loc['A':'D',['col2']]
# intead of giving as series, its giving us a dataframe

Unnamed: 0,col2
A,31
B,62
C,16
D,58


In [57]:
df_01['col2']
# when column selection if one square bracket[], then it will return the result as a series

A    31
B    62
C    16
D    58
E     9
Name: col2, dtype: int64

In [58]:
df_01[['col2']]
# when column selection if two square bracket[[]], then it will return the result as a dataframe

Unnamed: 0,col2
A,31
B,62
C,16
D,58
E,9


In [59]:
df_01

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [60]:
df_01.iloc[:2,3]

A    33
B    36
Name: col4, dtype: int64

In [64]:
df_01.loc['B':'D','col3':'col4']

Unnamed: 0,col3,col4
B,86,36
C,66,15
D,73,88


In [65]:
df_01.loc['B':'D',['col3','col4']]

Unnamed: 0,col3,col4
B,86,36
C,66,15
D,73,88


In [66]:
df_01.loc[::2,'col1':'col3']
# this will select starting from col1 until col3

Unnamed: 0,col1,col2,col3
A,76,31,4
C,69,16,66
E,47,9,54


In [67]:
df_01.loc[::2,['col1','col3']]
# this will select only col1 and col3

Unnamed: 0,col1,col3
A,76,4
C,69,66
E,47,54


In [69]:
df_01

Unnamed: 0,col1,col2,col3,col4
A,76,31,4,33
B,96,62,86,36
C,69,16,66,15
D,54,58,73,88
E,47,9,54,13


In [68]:
df_01.loc[::2][['col1','col3']]
# another was of selecting row and column with loc and iclos syntex is as:
# df.loc[row_Selection][column_selection]

Unnamed: 0,col1,col3
A,76,4
C,69,66
E,47,54


In [70]:
df_01.loc[::2][['col1':'col3':2]]
# do not forget that in this technique, we cannot use slicing operation to select columns

SyntaxError: invalid syntax (462153864.py, line 1)

In [72]:
df_01.loc[::2]['col1']
# second quare bracket [] makes sure to give the result as a dataframe
# if you use singel sqaure [], it will give you result as series

A    76
C    69
E    47
Name: col1, dtype: int64

## <font color='green'> <b>Conditional Selection</b><font color='black'>

In [100]:
np.random.seed(42)
data_4 = np.random.randint(1, 100, 20).reshape(5,4)
df_4 = pd.DataFrame(data_4, columns=['col1', 'col2', 'col3', 'col4'], index='a b c d e'.split())
df_4

Unnamed: 0,col1,col2,col3,col4
a,52,93,15,72
b,61,21,83,87
c,75,75,88,24
d,3,22,53,2
e,88,30,38,2


In [76]:
df_4 > 35

Unnamed: 0,col1,col2,col3,col4
a,True,True,False,True
b,True,False,True,True
c,True,True,True,False
d,False,False,True,False
e,True,False,True,False


In [77]:
df_4[df_4>35]

Unnamed: 0,col1,col2,col3,col4
a,52.0,93.0,,72.0
b,61.0,,83.0,87.0
c,75.0,75.0,88.0,
d,,,53.0,
e,88.0,,38.0,


In [78]:
df_4

Unnamed: 0,col1,col2,col3,col4
a,52,93,15,72
b,61,21,83,87
c,75,75,88,24
d,3,22,53,2
e,88,30,38,2


In [82]:
df_4[df_4['col4']>30]
# it will select row a and b because 72 and 87 is > 30
# because we did not specify the row, it will return the entire row

Unnamed: 0,col1,col2,col3,col4
a,52,93,15,72
b,61,21,83,87


In [84]:
df_4.col4[df_4['col4']>30]
# here is specify the select data only from col4

a    72
b    87
Name: col4, dtype: int64

In [85]:
df_4[['col4']][df_4['col4']>30]
# the result will be dataframe

Unnamed: 0,col4
a,72
b,87


## <font color='green'> <b>reset_index() & set_index()</b><font color='black'>

- reset_index() resets the DataFrame index and uses the default state instead.

- set_index() sets a specified column or columns of the DataFrame as a new index of the DataFrame.

In [97]:
df_4

Unnamed: 0_level_0,col2,col3,col4
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52,93,15,72
61,21,83,87
75,75,88,24
3,22,53,2
88,30,38,2


In [101]:
# setting the column name as index
df_4.set_index('col1') # not perminent

# this might be useful we want to set one data as index
# ex: to know the temperature, dumidity, etx for a specific 'date', we need to set 'date' as index

Unnamed: 0_level_0,col2,col3,col4
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52,93,15,72
61,21,83,87
75,75,88,24
3,22,53,2
88,30,38,2


In [102]:
df_4.set_index("col1", inplace=True) # change is perminant
df_4

Unnamed: 0_level_0,col2,col3,col4
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52,93,15,72
61,21,83,87
75,75,88,24
3,22,53,2
88,30,38,2


In [103]:
df_4.reset_index() # not perminant
# reset the index back to original value

Unnamed: 0,col1,col2,col3,col4
0,52,93,15,72
1,61,21,83,87
2,75,75,88,24
3,3,22,53,2
4,88,30,38,2


In [104]:
df_4.reset_index(inplace=True) # perminant
df_4

Unnamed: 0,col1,col2,col3,col4
0,52,93,15,72
1,61,21,83,87
2,75,75,88,24
3,3,22,53,2
4,88,30,38,2


In [105]:
df_4.set_index('col3', inplace=True)
df_4

Unnamed: 0_level_0,col1,col2,col4
col3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15,52,93,72
83,61,21,87
88,75,75,24
53,3,22,2
38,88,30,2


In [107]:
df_4.reset_index(drop=True, inplace=True)
df_4
# by dafult drop=False, it will not drop the column entirely while resetting the index back
# if we say drop=True, it will drop the column entirely  while resetting the index
# col3 does not exist

Unnamed: 0,col1,col2,col4
0,52,93,72
1,61,21,87
2,75,75,24
3,3,22,2
4,88,30,2


## <font color='green'> <b>isna() & fillna() & dropna()</b><font color='black'>

In [109]:
df_csv_01 = pd.read_csv('adult_eda.csv')
df_csv_01

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9.0,Never-married,Adm-clerical,,White,Male,0,0,20,United-States,<=50K


In [110]:
df_csv_01.isna()
# checks for empty data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32557,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32558,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
32559,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False


In [111]:
df_csv_01.isna().sum()

age                  0
workclass            0
fnlwgt               0
education            0
education-num      802
marital-status       0
occupation           0
relationship      5068
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country       0
salary               0
dtype: int64

In [112]:
df_csv_01.relationship

0        Not-in-family
1              Husband
2        Not-in-family
3              Husband
4                 Wife
             ...      
32556             Wife
32557          Husband
32558        Unmarried
32559              NaN
32560             Wife
Name: relationship, Length: 32561, dtype: object

In [113]:
df_csv_01.relationship.dropna()
# removing or droppding rows containing null values in relationship column
# length of relationship column was 32561, 5068 of them were null values, 
# after dropping the null values, it will have 27493 data

0        Not-in-family
1              Husband
2        Not-in-family
3              Husband
4                 Wife
             ...      
32555    Not-in-family
32556             Wife
32557          Husband
32558        Unmarried
32560             Wife
Name: relationship, Length: 27493, dtype: object

In [114]:
# make the change perminant
df_csv_01.relationship.dropna(inplace=True)
df_csv_01.relationship
# even after adding inplace=True, changes are not perminant
# because there are other columns, data cannot be empty, it will fill it with null values

0        Not-in-family
1              Husband
2        Not-in-family
3              Husband
4                 Wife
             ...      
32556             Wife
32557          Husband
32558        Unmarried
32559              NaN
32560             Wife
Name: relationship, Length: 32561, dtype: object

In [115]:
df_csv_01.dropna(inplace=True)
df_csv_01
# when we drop null rows from the entire data frame, the changes are perminant
# otherwise if you are dropping from a single column, after dropping the null rows, as the column rejoins ewith the rest of dataframe,
# it will set the same rows back to null

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,22,Private,310152,Some-college,10.0,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32556,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


- we do not drop the null values
- data is very valuable, we do not drop the data

In [116]:
df_csv_01.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [118]:
df_csv_01 = pd.read_csv('adult_eda.csv')
df_csv_01

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9.0,Never-married,Adm-clerical,,White,Male,0,0,20,United-States,<=50K


In [119]:
df_csv_01.isna().sum()

age                  0
workclass            0
fnlwgt               0
education            0
education-num      802
marital-status       0
occupation           0
relationship      5068
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country       0
salary               0
dtype: int64

In [124]:
# filling null values with Husband
df_csv_01.relationship.fillna('Husband', inplace=True)
# this is just an example, we cannot fill the null values with any random value
# we have to fillow a policy or a strategy to do it

In [125]:
df_csv_01.isna().sum()

age                 0
workclass           0
fnlwgt              0
education           0
education-num     802
marital-status      0
occupation          0
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country      0
salary              0
dtype: int64