# Pandas Practices

In [1]:
# Import pandas and numpy
import numpy as np
import pandas as pd

In [2]:
# Create Series
series = pd.Series([3, 53, 22, 52, 43])
print(series)

0     3
1    53
2    22
3    52
4    43
dtype: int64


In [3]:
# Check the values of the series
series.values

array([ 3, 53, 22, 52, 43])

In [4]:
# Check datatype of the series
series.dtype

dtype('int64')

In [5]:
# Check the indexing of the series
series.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Check the heading of series
series.name

In [7]:
# Assign the heading of series
series.name = "Numbers"
print(series)

0     3
1    53
2    22
3    52
4    43
Name: Numbers, dtype: int64


In [8]:
# Indexing 
series[0] # Get the first element
series[len(series) - 1] # Get the last element
series[0:3] # Get the range of the element

0     3
1    53
2    22
Name: Numbers, dtype: int64

In [9]:
# Indexing with iloc function
series.iloc[3] # Get specific element
series.iloc[[1, 2, 4]] # Get specifi range of element

1    53
2    22
4    43
Name: Numbers, dtype: int64

In [10]:
# Assign the name to the index of series
index = ["Apple", "Banana", "Mango", "Orange", "Grape"]
series.index = index
series.name = "Calories"
series

Apple      3
Banana    53
Mango     22
Orange    52
Grape     43
Name: Calories, dtype: int64

In [11]:
# Access the series with name index
series["Mango"]

np.int64(22)

In [12]:
# Accessing label based indexing with loc funtion
series.loc["Mango"] # Get specific element
series.loc[["Mango", "Grape", "Banana"]] # Get specific range of element

Mango     22
Grape     43
Banana    53
Name: Calories, dtype: int64

In [13]:
# Create Series with the help of dictionary
protien = {
    "Beef": 39,
    "Chicken": 26,
    "Egg": 6,
    "Rice": 1
}

series2 = pd.Series(protien, name="Protien per 100gm")
series2

Beef       39
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [14]:
# Conditional Selection
series2[series2 > 6]

Beef       39
Chicken    26
Name: Protien per 100gm, dtype: int64

In [15]:
# Logical and, or, and not operation
series2[(series2 > 6) & (series2 < 39)]
series2[(series2 > 6) | (series2 < 20)]

Beef       39
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [16]:
# Modify the element of series
series2["Beef"] = 40
series2

Beef       40
Chicken    26
Egg         6
Rice        1
Name: Protien per 100gm, dtype: int64

In [17]:
# exercise
ser = pd.Series(['a', np.nan, 1, np.nan, 2])
ser.notnull().sum()

np.int64(3)

In [18]:
# Create pandas dataframe
data = {
    "Name": ["Syful", "Taifur", "Rohan", "Arman", "Sufian", "Arifa", "Taifur"],
    "Age": [25, 24, 25, np.nan, 27, 21, 24],
    "Department": ["IT", "Marketing", "HR", "IT", "HR", "HR", "Marketing"],
    "Salary": [60000, 45000, 40000, 65000, np.nan, 50000, 45000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0
3,Arman,,IT,65000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [19]:
# Access the first three rows of data
df.head(3)

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0


In [20]:
# Access the last three rows of data
df.tail(3)

Unnamed: 0,Name,Age,Department,Salary
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [21]:
# Access data with iloc
df.iloc[1:3, [0, 2]]

Unnamed: 0,Name,Department
1,Taifur,Marketing
2,Rohan,HR


In [22]:
# Access data with loc
df.loc[2:, ["Name", "Salary"]]

Unnamed: 0,Name,Salary
2,Rohan,40000.0
3,Arman,65000.0
4,Sufian,
5,Arifa,50000.0
6,Taifur,45000.0


In [23]:
# Get column
df["Name"] # Get single column
df[["Name", "Salary"]] # Get range of columns

Unnamed: 0,Name,Salary
0,Syful,60000.0
1,Taifur,45000.0
2,Rohan,40000.0
3,Arman,65000.0
4,Sufian,
5,Arifa,50000.0
6,Taifur,45000.0


In [24]:
# Drop column if the column has nan value
df.drop("Age", axis=1)

Unnamed: 0,Name,Department,Salary
0,Syful,IT,60000.0
1,Taifur,Marketing,45000.0
2,Rohan,HR,40000.0
3,Arman,IT,65000.0
4,Sufian,HR,
5,Arifa,HR,50000.0
6,Taifur,Marketing,45000.0


In [25]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,60000.0
1,Taifur,24.0,Marketing,45000.0
2,Rohan,25.0,HR,40000.0
3,Arman,,IT,65000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,50000.0
6,Taifur,24.0,Marketing,45000.0


In [26]:
# Get the shape of dataset or dataframe
df.shape

(7, 4)

In [27]:
# Get basic info about each column of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        7 non-null      object 
 1   Age         6 non-null      float64
 2   Department  7 non-null      object 
 3   Salary      6 non-null      float64
dtypes: float64(2), object(2)
memory usage: 356.0+ bytes


In [28]:
# Get the basic statistics of dataframe
df.describe()

Unnamed: 0,Age,Salary
count,6.0,6.0
mean,24.333333,50833.333333
std,1.966384,9703.951085
min,21.0,40000.0
25%,24.0,45000.0
50%,24.5,47500.0
75%,25.0,57500.0
max,27.0,65000.0


In [29]:
# Broadcasting employ salary by 5000
df["Salary"] = df["Salary"] + 5000
df

Unnamed: 0,Name,Age,Department,Salary
0,Syful,25.0,IT,65000.0
1,Taifur,24.0,Marketing,50000.0
2,Rohan,25.0,HR,45000.0
3,Arman,,IT,70000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,55000.0
6,Taifur,24.0,Marketing,50000.0


In [30]:
# Raname column name 
df.rename(columns = {"Department": "Dept."}, inplace=True)
df

Unnamed: 0,Name,Age,Dept.,Salary
0,Syful,25.0,IT,65000.0
1,Taifur,24.0,Marketing,50000.0
2,Rohan,25.0,HR,45000.0
3,Arman,,IT,70000.0
4,Sufian,27.0,HR,
5,Arifa,21.0,HR,55000.0
6,Taifur,24.0,Marketing,50000.0


In [31]:
# Get the unique values in particular column
df["Dept."].unique()

array(['IT', 'Marketing', 'HR'], dtype=object)

In [32]:
# Group by the columns unique values to get the count of the particular values
df["Dept."].value_counts()

Dept.
HR           3
IT           2
Marketing    2
Name: count, dtype: int64

In [33]:
df["Promoted Salary"] = df["Salary"] + df["Salary"] * 0.1
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Rohan,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [34]:
# Check the null value per column
df.isnull().sum()

Name               0
Age                1
Dept.              0
Salary             1
Promoted Salary    1
dtype: int64

In [35]:
# Drop null value
df.dropna() # Drop a row if any data is null in the particular row
df.dropna(how="all") # Drop a row if all data is null in the particular row
df.dropna(how="any") # It works like the default

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Rohan,25.0,HR,45000.0,49500.0
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [36]:
# Fill the null value
df.fillna(0) # Fill the null value with zero
df["Age"].fillna(df["Age"].mean()) # Fill the null value of age column with average
df["Salary"].fillna(df["Salary"].median()) # Fill the null value of salary with median
# df["Age"].fillna(method="ffill") # Fill the null value with forward fill
df["Age"].ffill() # same as above
df["Age"].fillna(method="bfill") # Fill the null value with backword fill

  df["Age"].fillna(method="bfill") # Fill the null value with backword fill


0    25.0
1    24.0
2    25.0
3    27.0
4    27.0
5    21.0
6    24.0
Name: Age, dtype: float64

In [37]:
# Change any particular value corresponding to row and column
df["Name"] = df["Name"].replace("Rohan", "Nazmul")
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0
6,Taifur,24.0,Marketing,50000.0,55000.0


In [38]:
# Get the duplicated dataset
duplicated = df[df.duplicated(keep="first")]
duplicated

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
6,Taifur,24.0,Marketing,50000.0,55000.0


In [39]:
# drop the duplicated row
df = df.drop_duplicates()
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,71500.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,77000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0


In [40]:
# Invalid values with lambda function
df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/1.1 if x > 70000 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/1.1 if x > 70000 else x)


In [41]:
df

Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,65000.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0


In [42]:
# apply and lambda function
def double_age(x):
    return x * x

df["Age"] = df["Age"].apply(double_age)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].apply(double_age)


Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,625.0,IT,65000.0,65000.0
1,Taifur,576.0,Marketing,50000.0,55000.0
2,Nazmul,625.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,729.0,HR,,
5,Arifa,441.0,HR,55000.0,60500.0


In [43]:
df["Age"] = df["Age"].apply(lambda x: np.sqrt(x))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].apply(lambda x: np.sqrt(x))


Unnamed: 0,Name,Age,Dept.,Salary,Promoted Salary
0,Syful,25.0,IT,65000.0,65000.0
1,Taifur,24.0,Marketing,50000.0,55000.0
2,Nazmul,25.0,HR,45000.0,49500.0
3,Arman,,IT,70000.0,70000.0
4,Sufian,27.0,HR,,
5,Arifa,21.0,HR,55000.0,60500.0


# Pandas Practice From Another Video

In [44]:
# import pandas and numpy
import pandas as pd
import numpy as np

In [45]:
# Create Series
values = [10, 20, 30, 40, 50, 60]
s = pd.Series(values, index=["a", "b", "c", "d", "e", "f"], name="Numbers")
print(s)

a    10
b    20
c    30
d    40
e    50
f    60
Name: Numbers, dtype: int64


In [46]:
# Crete DataFrame
df = pd.DataFrame({
    "name": ["Mike", "Bob", "Alice"],
    "age": [30, 80, 45],
    "job": ["Programmer", "Clerk", "Designer"]
})

df = df.set_index("name")
df

Unnamed: 0_level_0,age,job
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mike,30,Programmer
Bob,80,Clerk
Alice,45,Designer


In [88]:
# Reset index
df.reset_index(inplace=True)
df

TypeError: Cannot reset_index inplace on a Series to create a DataFrame

In [48]:
# export as csv
df.to_csv("my-data.csv", index=None)

In [49]:
# import data
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True).frame

In [50]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [51]:
data.shape

(20640, 9)

In [60]:
# Get the random data
data.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
10385,6.4114,15.0,7.527559,1.049869,2529.0,3.318898,33.6,-117.65,2.787
13332,3.9336,15.0,5.553191,1.06874,2039.0,3.337152,34.04,-117.65,1.51
16507,4.4213,11.0,7.345936,1.132325,1605.0,3.034026,37.74,-121.11,1.827
15515,5.4819,12.0,5.759346,0.962617,1226.0,2.864486,33.17,-117.1,1.838
1135,2.3711,20.0,5.397015,0.997015,763.0,2.277612,39.74,-121.64,1.218


In [62]:
# Get the columns of dataset
list(data.columns)

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'MedHouseVal']

In [64]:
# Set the max columns you can see
pd.options.display.max_columns = 500
data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [65]:
# Get the statistics of individual column
data["HouseAge"].describe()

count    20640.000000
mean        28.639486
std         12.585558
min          1.000000
25%         18.000000
50%         29.000000
75%         37.000000
max         52.000000
Name: HouseAge, dtype: float64

In [80]:
# Access value with at or iat
df.at["Mike", "age"]

np.int64(30)

In [93]:
# apply custom function
df = pd.DataFrame({
    "name": ["Mike", "Bob", "Alice"],
    "age": [30, 80, 45],
    "job": ["Programmer", "Clerk", "Designer"]
})
def double_age(x):
    return x * 2

def profession(x):
    if x.endswith("r"):
        return "Tech"
    else:
        return x

df["age"] = df["age"].apply(double_age)
df["job"] = df["job"].apply(profession)
df

Unnamed: 0,name,age,job
0,Mike,60,Tech
1,Bob,160,Clerk
2,Alice,90,Tech


In [94]:
# Use function with the use of lambda function
df["age"] = df["age"].apply(lambda x: x / 2 if x >= 60 else x)
df

Unnamed: 0,name,age,job
0,Mike,30.0,Tech
1,Bob,80.0,Clerk
2,Alice,45.0,Tech


In [96]:
# Create new column with combinging other column
df["summary"] = df.apply(lambda row: f"{row["age"]}-{row["job"]}", axis=1)
df

Unnamed: 0,name,age,job,summary
0,Mike,30.0,Tech,30.0-Tech
1,Bob,80.0,Clerk,80.0-Clerk
2,Alice,45.0,Tech,45.0-Tech


In [99]:
# Drop any column
df = df.drop("summary", axis=1)
df

Unnamed: 0,name,age,job
0,Mike,30.0,Tech
1,Bob,80.0,Clerk
2,Alice,45.0,Tech


In [102]:
# iterate through the rows
for i, row in df.iterrows():
    print(i)
    print(row)

0
name    Mike
age     30.0
job     Tech
Name: 0, dtype: object
1
name      Bob
age      80.0
job     Clerk
Name: 1, dtype: object
2
name    Alice
age      45.0
job      Tech
Name: 2, dtype: object


In [104]:
# iterate through column
for i, col in df.items():
    print(i)
    print(col)

name
0     Mike
1      Bob
2    Alice
Name: name, dtype: object
age
0    30.0
1    80.0
2    45.0
Name: age, dtype: float64
job
0     Tech
1    Clerk
2     Tech
Name: job, dtype: object


In [105]:
# Set the birthday 
import datetime as dt
df["birthday"] = df["age"].apply(lambda x: dt.datetime.now() - dt.timedelta(days=365*x))
df

Unnamed: 0,name,age,job,birthday
0,Mike,30.0,Tech,1995-12-02 11:31:52.734919
1,Bob,80.0,Clerk,1945-12-14 11:31:52.734940
2,Alice,45.0,Tech,1980-12-05 11:31:52.734946


In [106]:
df[df["birthday"].dt.year > 1950]

Unnamed: 0,name,age,job,birthday
0,Mike,30.0,Tech,1995-12-02 11:31:52.734919
2,Alice,45.0,Tech,1980-12-05 11:31:52.734946


In [107]:
df[df["age"].isin([30.0, 45.0])]

Unnamed: 0,name,age,job,birthday
0,Mike,30.0,Tech,1995-12-02 11:31:52.734919
2,Alice,45.0,Tech,1980-12-05 11:31:52.734946


In [115]:
# You can use query function to do use condition easily
df["age"] = df["age"].astype("int64")
df.query("age < 50")

Unnamed: 0,name,age,job,birthday
0,Mike,30,Tech,1995-12-02 11:31:52.734919
2,Alice,45,Tech,1980-12-05 11:31:52.734946


In [120]:
# Use groupby method
df.groupby("job").agg({
    "age": ["sum", "mean"],
    "birthday": "count"
})

Unnamed: 0_level_0,age,age,birthday
Unnamed: 0_level_1,sum,mean,count
job,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Clerk,80,80.0,1
Tech,75,37.5,2


In [125]:
# Sort the dataframe
df.sort_values("age", ascending=False)

Unnamed: 0,name,age,job,birthday
1,Bob,80,Clerk,1945-12-14 11:31:52.734940
2,Alice,45,Tech,1980-12-05 11:31:52.734946
0,Mike,30,Tech,1995-12-02 11:31:52.734919


In [128]:
# Concatenate the dataset
df1 = pd.DataFrame({
    "Item": ["A", "B", "C"],
    "Price": [10, 20, 30]
})

df2 = pd.DataFrame({
    "Item": ["D", "E", "F"],
    "Price": [40, 50, 60]
})

pd.concat([df1, df2]).reset_index().drop("index", axis=1)

Unnamed: 0,Item,Price
0,A,10
1,B,20
2,C,30
3,D,40
4,E,50
5,F,60


In [130]:
# More Join functionalities
df1 = pd.DataFrame({
    "Item": ["A", "B", "C"],
    "Price": [10, 20, 30]
})

df2 = pd.DataFrame({
    "Country": ["X", "Y", "Z"],
    "Available": [True, True, False]
})

# Outer join
pd.concat([df1, df2])
pd.concat([df1, df2], axis=1)

Unnamed: 0,Item,Price,Country,Available
0,A,10,X,True
1,B,20,Y,True
2,C,30,Z,False


In [131]:
df3 = pd.DataFrame({
    "Item": ["B", "C", "D"],
    "Country": ["X", "Y", "Z"]
})

In [137]:
# Inner join
pd.merge(df1, df3)

# Outer join
pd.merge(df1, df3, how="outer")

# Left join
pd.merge(df1, df3, how="left")

# Right join
pd.merge(df1, df3, how="right")

# Join based on item column
pd.merge(df1, df3, on="Item", how="outer")

Unnamed: 0,Item,Price,Country
0,A,10.0,
1,B,20.0,X
2,C,30.0,Y
3,D,,Z


In [139]:
# Join based on index
df4 = pd.DataFrame({
    "Price": [10, 20, 30]
}, index=["A", "B", "C"])

df5 = pd.DataFrame({
    "Country": ["X", "Y", "Z"]
}, index=["B", "C", "D"])

df4.join(df5, how="inner")

Unnamed: 0,Price,Country
B,20,X
C,30,Y
