# Pandas Practices

In [232]:
# Import pandas and numpy
import numpy as np
import pandas as pd

In [233]:
# Create Series
series = pd.Series([3, 53, 22, 52, 43])
print(series)

0     3
1    53
2    22
3    52
4    43
dtype: int64


In [234]:
# Check the values of the series
series.values

array([ 3, 53, 22, 52, 43])

In [235]:
# Check datatype of the series
series.dtype

dtype('int64')

In [236]:
# Check the indexing of the series
series.index

RangeIndex(start=0, stop=5, step=1)

In [237]:
# Check the heading of series
series.name

In [238]:
# Assign the heading of series
series.name = "Numbers"
print(series)

0     3
1    53
2    22
3    52
4    43
Name: Numbers, dtype: int64


In [239]:
# Indexing 
series[0] # Get the first element
series[len(series) - 1] # Get the last element
series[0:3] # Get the range of the element

0     3
1    53
2    22
Name: Numbers, dtype: int64

In [240]:
# Indexing with iloc function
series.iloc[3] # Get specific element
series.iloc[[1, 2, 4]] # Get specifi range of element

1    53
2    22
4    43
Name: Numbers, dtype: int64

In [241]:
# Assign the name to the index of series
index = ["Apple", "Banana", "Mango", "Orange", "Grape"]
series.index = index
series.name = "Calories"
series

Apple      3
Banana    53
Mango     22
Orange    52
Grape     43
Name: Calories, dtype: int64

In [242]:
# Access the series with name index
series["Mango"]

np.int64(22)

In [243]:
# Accessing label based indexing with loc funtion
series.loc["Mango"] # Get specific element
series.loc[["Mango", "Grape", "Banana"]] # Get specific range of element

Mango     22
Grape     43
Banana    53
Name: Calories, dtype: int64

In [244]:
# Create Series with the help of dictionary
protien = {
    "Beef": 39,
    "Chicken": 26,
    "Egg": 6,
    "Rice": 1
}

series2 = pd.Series(protien, name="Protien per 100gm")
series2

Beef       39
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [245]:
# Conditional Selection
series2[series2 > 6]

Beef       39
Chicken    26
Name: Protien per 100gm, dtype: int64

In [246]:
# Logical and, or, and not operation
series2[(series2 > 6) & (series2 < 39)]
series2[(series2 > 6) | (series2 < 20)]

Beef       39
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [247]:
# Modify the element of series
series2["Beef"] = 40
series2

Beef       40
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [248]:
# exercise
ser = pd.Series(['a', np.nan, 1, np.nan, 2])
ser.notnull().sum()

np.int64(3)

In [249]:
# Create pandas dataframe
data = {
    "Name": ["Syful", "Taifur", "Rohan", "Arman", "Sufian", "Arifa", "Taifur"],
    "Age": [25, 24, 25, np.nan, 27, 21, 24],
    "Department": ["IT", "Marketing", "HR", "IT", "HR", "HR", "Marketing"],
    "Salary": [60000, 45000, 40000, 65000, np.nan, 50000, 45000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0
3,Arman,,IT,65000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [250]:
# Access the first three rows of data
df.head(3)

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0


In [251]:
# Access the last three rows of data
df.tail(3)

Unnamed: 0,Name,Age,Department,Salary
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [252]:
# Access data with iloc
df.iloc[1:3, [0, 2]]

Unnamed: 0,Name,Department
1,Taifur,Marketing
2,Rohan,HR


In [253]:
# Access data with loc
df.loc[2:, ["Name", "Salary"]]

Unnamed: 0,Name,Salary
2,Rohan,40000.0
3,Arman,65000.0
4,Sufian,
5,Arifa,50000.0
6,Taifur,45000.0


In [254]:
# Get column
df["Name"] # Get single column
df[["Name", "Salary"]] # Get range of columns

Unnamed: 0,Name,Salary
0,Syful,60000.0
1,Taifur,45000.0
2,Rohan,40000.0
3,Arman,65000.0
4,Sufian,
5,Arifa,50000.0
6,Taifur,45000.0


In [255]:
# Drop column if the column has nan value
df.drop("Age", axis=1)

Unnamed: 0,Name,Department,Salary
0,Syful,IT,60000.0
1,Taifur,Marketing,45000.0
2,Rohan,HR,40000.0
3,Arman,IT,65000.0
4,Sufian,HR,
5,Arifa,HR,50000.0
6,Taifur,Marketing,45000.0


In [256]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0
3,Arman,,IT,65000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [257]:
# Get the shape of dataset or dataframe
df.shape

(7, 4)

In [258]:
# Get basic info about each column of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        7 non-null      object 
 1   Age         6 non-null      float64
 2   Department  7 non-null      object 
 3   Salary      6 non-null      float64
dtypes: float64(2), object(2)
memory usage: 356.0+ bytes


In [259]:
# Get the basic statistics of dataframe
df.describe()

Unnamed: 0,Age,Salary
count,6.0,6.0
mean,24.333333,50833.333333
std,1.966384,9703.951085
min,21.0,40000.0
25%,24.0,45000.0
50%,24.5,47500.0
75%,25.0,57500.0
max,27.0,65000.0


In [260]:
# Broadcasting employ salary by 5000
df["Salary"] = df["Salary"] + 5000
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,65000.0
1,Taifur,24.0,Marketing,50000.0
2,Rohan,25.0,HR,45000.0
3,Arman,,IT,70000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,55000.0
6,Taifur,24.0,Marketing,50000.0


In [261]:
# Raname column name 
df.rename(columns = {"Department": "Dept."}, inplace=True)
df

Unnamed: 0,Name,Age,Dept.,Salary
0,Syful,25.0,IT,65000.0
1,Taifur,24.0,Marketing,50000.0
2,Rohan,25.0,HR,45000.0
3,Arman,,IT,70000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,55000.0
6,Taifur,24.0,Marketing,50000.0


In [262]:
# Get the unique values in particular column
df["Dept."].unique()

array(['IT', 'Marketing', 'HR'], dtype=object)

In [263]:
# Group by the columns unique values to get the count of the particular values
df["Dept."].value_counts()

Dept.
HR           3
IT           2
Marketing    2
Name: count, dtype: int64

In [264]:
df["Promoted Salary"] = df["Salary"] + df["Salary"] * 0.1
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Rohan,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [265]:
# Check the null value per column
df.isnull().sum()

Name               0
Age                1
Dept.              0
Salary             1
Promoted Salary    1
dtype: int64

In [266]:
# Drop null value
df.dropna() # Drop a row if any data is null in the particular row
df.dropna(how="all") # Drop a row if all data is null in the particular row
df.dropna(how="any") # It works like the default

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Rohan,25.0,HR,45000.0,49500.0
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [267]:
# Fill the null value
df.fillna(0) # Fill the null value with zero
df["Age"].fillna(df["Age"].mean()) # Fill the null value of age column with average
df["Salary"].fillna(df["Salary"].median()) # Fill the null value of salary with median
# df["Age"].fillna(method="ffill") # Fill the null value with forward fill
df["Age"].ffill() # same as above
df["Age"].fillna(method="bfill") # Fill the null value with backword fill

  df["Age"].fillna(method="bfill") # Fill the null value with backword fill


0    25.0
1    24.0
2    25.0
3    27.0
4    27.0
5    21.0
6    24.0
Name: Age, dtype: float64

In [268]:
# Change any particular value corresponding to row and column
df["Name"] = df["Name"].replace("Rohan", "Nazmul")
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [269]:
# Get the duplicated dataset
duplicated = df[df.duplicated(keep="first")]
duplicated

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
6,Taifur,24.0,Marketing,50000.0,55000.0


In [270]:
# drop the duplicated row
df = df.drop_duplicates()
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0


In [271]:
# Invalid values with lambda function
df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/1.1 if x > 70000 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/1.1 if x > 70000 else x)


In [272]:
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,65000.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0


In [276]:
# apply and lambda function
def double_age(x):
    return x * x

df["Age"] = df["Age"].apply(double_age)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].apply(double_age)


Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,390625.0,IT,65000.0,65000.0
1,Taifur,331776.0,Marketing,50000.0,55000.0
2,Nazmul,390625.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,531441.0,HR,,
5,Arifa,194481.0,HR,55000.0,60500.0


In [278]:
df["Age"] = df["Age"].apply(lambda x: np.sqrt(x))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].apply(lambda x: np.sqrt(x))


Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,65000.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0
