In [1]:
import pandas as pd
import numpy as np

## Load the dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('data.csv')

In [2]:
## Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [4]:
## Preview the last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [5]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [15]:
## Handling missing values: count the missing entries in each column.
df.isna().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [16]:
## The Value and Sales columns contain missing values.
## Simplest option: replace every missing entry with zero.
## fillna returns a new frame, so df itself is left unchanged.
df_filled = df.fillna(value=0)

In [22]:
## A better way to fill missing values is to use the column mean (or median).
## The filled result goes into a new column so the original Sales stays intact.
df['sales_filled'] = (
    df['Sales']
    .fillna(value=df['Sales'].mean())
)

df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region,sales_filled
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [25]:
## Rename a column: Date becomes sales_date.
df = df.rename(columns={'Date': 'sales_date'})

df.head()

Unnamed: 0,sales_date,Category,Value,Product,Sales,Region,sales_filled
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [29]:
## Change the data type of a column.
## Missing (NaN) values must be filled before casting to int.
## Note: astype(int) drops the fractional part of the filled-in mean.
df['value_new'] = (
    df['Value']
    .fillna(df['Value'].mean())
    .astype(int)
)

df.head()

Unnamed: 0,sales_date,Category,Value,Product,Sales,Region,sales_filled,value_New,value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,26


In [33]:
## Transform a single column: double every Value.
## Vectorised arithmetic replaces the per-element apply(lambda ...); missing values stay NaN.
df['value_update'] = df['Value'] * 2

df.head(5)

Unnamed: 0,sales_date,Category,Value,Product,Sales,Region,sales_filled,value_new,value_update
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0


In [31]:
## NOTE(review): duplicate preview. Its recorded output predates the value_update
## cell above (that column is missing), so re-run the notebook top to bottom.
df.head()

Unnamed: 0,sales_date,Category,Value,Product,Sales,Region,sales_filled,value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26


In [35]:
## Aggregating and grouping data: mean Value for each Product.
product_mean = df.groupby(by='Product')['Value'].mean()

print(product_mean)

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64


In [41]:
## Group by two keys: total Value for every (Product, Region) pair.
group_prd_rgn_sum = (
    df.groupby(by=['Product', 'Region'])['Value']
    .sum()
)

print(group_prd_rgn_sum)


Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64


In [43]:
## Apply several aggregate functions at once: mean, sum and median of Value per Region.
group_agg_fn = (
    df.groupby(by='Region')['Value']
    .agg(['mean', 'sum', 'median'])
)

print(group_agg_fn)

             mean     sum  median
Region                           
East    42.307692   550.0    32.0
North   37.666667   339.0    39.0
South   62.000000   496.0    66.5
West    61.588235  1047.0    60.0


In [44]:
## Merging and joining DataFrames.
## Two small frames sharing the column Key; C exists only in df1 and D only in df2.
df1 = pd.DataFrame(
    data={
        'Key': ['A', 'B', 'C'],
        'Value1': [1, 2, 3],
    }
)
df2 = pd.DataFrame(
    data={
        'Key': ['A', 'B', 'D'],
        'Value2': [4, 5, 6],
    }
)

In [45]:
## Display the first (left-hand) frame used in the merges below.
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [46]:
## Display the second (right-hand) frame used in the merges below.
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [49]:
## Inner join: keep only the keys present in both frames.
inner_join_df = df1.merge(df2, on='Key', how='inner')

inner_join_df

Unnamed: 0,Key,Value1,Value2
0,A,1,4
1,B,2,5


In [51]:
## Full outer join: keep the keys from either frame; unmatched cells become NaN.
outer_join_df = df1.merge(df2, on='Key', how='outer')

print(outer_join_df)

  Key  Value1  Value2
0   A     1.0     4.0
1   B     2.0     5.0
2   C     3.0     NaN
3   D     NaN     6.0


In [52]:
## Left join: keep every key of df1; Value2 is NaN where df2 has no match.
left_outer_join_df = df1.merge(df2, on='Key', how='left')

print(left_outer_join_df)

  Key  Value1  Value2
0   A       1     4.0
1   B       2     5.0
2   C       3     NaN


In [53]:
## Right join: keep every key of df2; Value1 is NaN where df1 has no match.
right_outer_join_df = df1.merge(df2, on='Key', how='right')

print(right_outer_join_df)

  Key  Value1  Value2
0   A     1.0       4
1   B     2.0       5
2   D     NaN       6
