# PYTHON - PANDAS TUTORIAL

In [1]:
import pandas as pd
import numpy as np

# 1. Pandas - select rows

Provides 2 ways to select data.

* Select Rows by Integer Index - **iloc[]** -> df.iloc[start:stop:step]
* Select Rows by Index Label - **loc[]** -> df.loc[start:stop:step]

In [8]:
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Oracle","Java"],
    'Fee' :[20000,25000,26000,22000,24000,21000,22000],
    'Duration':['30days','40days','35days','40days',np.nan,None,'55days'],
    'Discount':[1000,2300,1500,1200,2500,2100,2000]
               }
index_labels=['r1','r2','r3','r4','r5','r6','r7']
df = pd.DataFrame(technologies,index=index_labels)
print("Create DataFrame:\n", df)

Create DataFrame:
     Courses    Fee Duration  Discount
r1    Spark  20000   30days      1000
r2  PySpark  25000   40days      2300
r3   Hadoop  26000   35days      1500
r4   Python  22000   40days      1200
r5   Pandas  24000      NaN      2500
r6   Oracle  21000     None      2100
r7     Java  22000   55days      2000


In [9]:
# Select Row by Index
print('--> df.iloc[2] <--')
print(df.iloc[2])
print('-------------------------')
# Select Rows by Index List
print('--> df.iloc[[2,3,6]] <--')
print(df.iloc[[2,3,6]])
print('-------------------------')
# Select Rows by Integer Index Range
print('--> df.iloc[1:5] <--')
print(df.iloc[1:5])
print('-------------------------')
# Select First Row
print('--> df.iloc[:1] <--')
print(df.iloc[:1])
print('-------------------------')
# Select First 3 Rows
print('--> df.iloc[:3] <--')
print(df.iloc[:3])
print('-------------------------')
# Select Last Row
print('--> df.iloc[-1:] <--')
print(df.iloc[-1:])
print('-------------------------')
# Select Last 3 Row
print('--> df.iloc[-3:] <--')
print(df.iloc[-3:])
print('-------------------------')
# Selects alternate rows
print('--> df.iloc[::2] <--')
print(df.iloc[::2])
print('-------------------------')
# Select Row by Index Label
print('--> df.loc[r2] <--')
print(df.loc['r2'])
print('-------------------------')
# Select Rows by Index Label List
print('--> df.loc[[r2,r3,r6]] <--')
print(df.loc[['r2','r3','r6']])
print('-------------------------')
# Select Rows by Label Index Range
print('--> df.loc[r1:r5] <--')
print(df.loc['r1':'r5'])
print('-------------------------')
# Select Rows by Label Index Range
print('--> df.loc[r1:r5] <--')
print(df.loc['r1':'r5'])
print('-------------------------')
# Select Alternate Rows with in Index Labels
print('--> df.loc[r1:r5:2] <--')
print(df.loc['r1':'r5':2])

--> df.iloc[2] <--
Courses     Hadoop
Fee          26000
Duration    35days
Discount      1500
Name: r3, dtype: object
-------------------------
--> df.iloc[[2,3,6]] <--
   Courses    Fee Duration  Discount
r3  Hadoop  26000   35days      1500
r4  Python  22000   40days      1200
r7    Java  22000   55days      2000
-------------------------
--> df.iloc[1:5] <--
    Courses    Fee Duration  Discount
r2  PySpark  25000   40days      2300
r3   Hadoop  26000   35days      1500
r4   Python  22000   40days      1200
r5   Pandas  24000      NaN      2500
-------------------------
--> df.iloc[:1] <--
   Courses    Fee Duration  Discount
r1   Spark  20000   30days      1000
-------------------------
--> df.iloc[:3] <--
    Courses    Fee Duration  Discount
r1    Spark  20000   30days      1000
r2  PySpark  25000   40days      2300
r3   Hadoop  26000   35days      1500
-------------------------
--> df.iloc[-1:] <--
   Courses    Fee Duration  Discount
r7    Java  22000   55days      2000
------

# 2. Pandas - select columns

Same as selecting rows above, only difference being column indexes or column lables must be provided after the "," in both loc[] and iloc[]

Provides 2 ways to select data.

* Select Rows by Integer Index - **iloc[]** -> df.iloc[ : ,start:stop]
* Select Rows by Index Label - **loc[]** -> df.loc[ : , start:stop]

In [13]:
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Oracle","Java"],
    'Fee' :[20000,25000,26000,22000,24000,21000,22000],
    'Duration':['30days','40days','35days','40days',np.nan,None,'55days'],
    'Discount':[1000,2300,1500,1200,2500,2100,2000]
               }
index_labels=['r1','r2','r3','r4','r5','r6','r7']
df = pd.DataFrame(technologies,index=index_labels)

print(df.loc[:, ["Courses","Fee","Duration"]]) # Selecte multiple columns
print('-------------------')
print(df.iloc[:,2:]) # Select From 3rd to end

    Courses    Fee Duration
r1    Spark  20000   30days
r2  PySpark  25000   40days
r3   Hadoop  26000   35days
r4   Python  22000   40days
r5   Pandas  24000      NaN
r6   Oracle  21000     None
r7     Java  22000   55days
-------------------
   Duration  Discount
r1   30days      1000
r2   40days      2300
r3   35days      1500
r4   40days      1200
r5      NaN      2500
r6     None      2100
r7   55days      2000


# 3. Pandas - query()

### Syntax:
DataFrame.query(expr, inplace=False, **kwargs)

* **expr** – This parameter specifies the query expression string, which follows Python’s syntax for conditional expressions.
* **inplace** – Defaults to False. When it is set to True, it updates the existing DataFrame, and query() method returns None.
* ** **kwargs** –  This parameter allows passing additional keyword arguments to the query expression. It is optional. Keyword arguments that work with eval()

In [14]:
technologies= {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas"],
    'Fee' :[22000,25000,23000,24000,26000],
    'Duration':['30days','50days','30days', None,np.nan],
    'Discount':[1000,2300,1000,1200,2500]
          }
df = pd.DataFrame(technologies)
print("Create DataFrame:\n", df)

Create DataFrame:
    Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300
2   Hadoop  23000   30days      1000
3   Python  24000     None      1200
4   Pandas  26000      NaN      2500


In [16]:

# Query all rows with Courses equals 'Spark'
df2 = df.query("Courses == 'Spark'")
print("After filtering the rows based on condition:\n", df2)

print('--------------------')

# Query rows by list of values
df2 = df.query("Courses in ('Spark','PySpark')")
print("After filtering the rows based on condition:\n", df2)

print('--------------------')

# Query by multiple conditions
df2 = df.query("Fee >= 23000 and Fee <= 24000")
print("After filtering the rows based on multiple conditions:\n", df2)

After filtering the rows based on condition:
   Courses    Fee Duration  Discount
0   Spark  22000   30days      1000
--------------------
After filtering the rows based on condition:
    Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300
--------------------
After filtering the rows based on multiple conditions:
   Courses    Fee Duration  Discount
2  Hadoop  23000   30days      1000
3  Python  24000     None      1200


### Query Rows using apply()

In [17]:
# By using lambda function
df2 = df.apply(lambda row: row[df['Courses'].isin(['Spark','PySpark'])])
print("After filtering the rows based on condition:\n", df2)

After filtering the rows based on condition:
    Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300


### Other Examples using df[] and loc[]

In [27]:
print(df.loc[df['Courses'].isin(['Spark','PySpark'])])

print('--------------------')

print(df.loc[(df['Discount'] >= 1200) & (df['Fee'] >= 23000 )])

print('--------------------')

# Select based on value contains
print(df[df['Courses'].str.contains("Spark")])

print('--------------------')

# Select startswith
print(df[df['Courses'].str.startswith("P")])

   Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300
--------------------
   Courses    Fee Duration  Discount
1  PySpark  25000   50days      2300
3   Python  24000     None      1200
4   Pandas  26000      NaN      2500
--------------------
   Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300
--------------------
   Courses    Fee Duration  Discount
1  PySpark  25000   50days      2300
3   Python  24000     None      1200
4   Pandas  26000      NaN      2500


# 4. Pandas - Get cell value

* Use .loc[] to get a cell value by row label and column label.
* Use .iloc[] to get a cell value by row and column index.
* at[] is a faster alternative for accessing a single cell using label-based indexing.
* .iat[] is similar to .at[], but uses integer-based indexing for faster access to a single cell.
* Convert the DataFrame to a NumPy array and access elements by array indexing.
* Prefer .at[] when performance is critical and only one value needs to be accessed.

In [28]:
# Create DataFrame
technologies = {
     'Courses':["Spark","PySpark","Hadoop","Python","pandas"],
     'Fee' :[24000,25000,25000,24000,24000],
     'Duration':['30day','50days','55days', '40days','60days'],
     'Discount':[1000,2300,1000,1200,2500]
          }
index_labels=['r1','r2','r3','r4','r5']
df = pd.DataFrame(technologies, index=index_labels)

In [32]:
# Using loc[]. Get cell value by name & index
print(df.loc['r4']['Duration'])
print('---------------------')

# Using iloc[]. Get cell value by index & name
print(df.iloc[3]['Duration'])
print('---------------------')

# Using DataFrame.at[]
print(df.at['r4','Duration'])
print(df.at[df.index[3],'Duration'])
print('---------------------')

# Using DataFrame.iat[]
print(df.iat[3,2])


40days
---------------------
40days
---------------------
40days
40days
---------------------
40days


# 5. Pandas - Add a new Column

### Syntax:
DataFrame.assign(**kwargs)


In [33]:
technologies= {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas"],
    'Fee' :[22000,25000,23000,24000,26000],
    'Discount':[1000,2300,1000,1200,2500]
          }

df = pd.DataFrame(technologies)

### Add multiple columns

In [35]:
tutors = ['William', 'Henry', 'Michael', 'John', 'Messi']
MNCCompanies = ['TATA','HCL','Infosys','Google','Amazon']
df2 = df.assign(MNCComp = MNCCompanies,TutorsAssigned=tutors )
print("Add multiple columns to DataFrame:\n", df2)

Add multiple columns to DataFrame:
    Courses    Fee  Discount  MNCComp TutorsAssigned
0    Spark  22000      1000     TATA        William
1  PySpark  25000      2300      HCL          Henry
2   Hadoop  23000      1000  Infosys        Michael
3   Python  24000      1200   Google           John
4   Pandas  26000      2500   Amazon          Messi


### Add a column from existing

In [36]:
# Derive New Column from Existing Column
df = pd.DataFrame(technologies)
df2 = df.assign(Discount_Percent=lambda x: x.Fee * x.Discount / 100)
print("Add column to DataFrame:\n", df2)

Add column to DataFrame:
    Courses    Fee  Discount  Discount_Percent
0    Spark  22000      1000          220000.0
1  PySpark  25000      2300          575000.0
2   Hadoop  23000      1000          230000.0
3   Python  24000      1200          288000.0
4   Pandas  26000      2500          650000.0


### Append Column to Existing Pandas DataFrame

In [44]:
# Add New column to the existing DataFrame
df = pd.DataFrame(technologies)
MNCCompanies = ['TATA','HCL','Infosys','Google','Amazon']
df["MNCCompanies"] = MNCCompanies
print("Add column to DataFrame:\n", df)

Add column to DataFrame:
    Courses    Fee  Discount MNCCompanies
0    Spark  22000      1000         TATA
1  PySpark  25000      2300          HCL
2   Hadoop  23000      1000      Infosys
3   Python  24000      1200       Google
4   Pandas  26000      2500       Amazon


### Add Column to Specific Position of DataFrame

In [45]:

# Add new column at the specific position
# Add new column to the DataFrame
tutors = ['William', 'Henry', 'Michael', 'John', 'Messi']
df.insert(0,'Tutors', tutors)
print("Add column to DataFrame:\n", df)

Add column to DataFrame:
     Tutors  Courses    Fee  Discount MNCCompanies
0  William    Spark  22000      1000         TATA
1    Henry  PySpark  25000      2300          HCL
2  Michael   Hadoop  23000      1000      Infosys
3     John   Python  24000      1200       Google
4    Messi   Pandas  26000      2500       Amazon


### Add a Column From Dictionary Mapping

In [47]:
# Add new column by mapping to the existing column
df = pd.DataFrame(technologies)
tutors = {"Spark":"William", "PySpark":"Henry", "Hadoop":"Michael","Python":"John", "pandas":"Messi"}
df['Tutors'] = df['Courses'].map(tutors)
print("Add column to DataFrame:\n", df)

Add column to DataFrame:
    Courses    Fee  Discount   Tutors
0    Spark  22000      1000  William
1  PySpark  25000      2300    Henry
2   Hadoop  23000      1000  Michael
3   Python  24000      1200     John
4   Pandas  26000      2500      NaN


# 6. Pandas - Rename a column

### Syntax:
DataFrame.rename(mapper=None, index=None, columns=None, axis=None, 
       copy=True, inplace=False, level=None, errors='ignore')

**mapper** – dictionary or function to rename columns and indexes.  
**index** – dictionary or function to rename index. When using with axis param, it should be (mapper, axis=0) which is equivalent to index=mapper.  
**columns** – dictionary or function to rename columns. When using with axis param, it should be (mapper, axis=0) which is equivalent to column=mapper.  
**axis** – Value can be either 0 or index | 1 or columns. Default set to ‘0’.  
**copy** – Copies the data as well. Default set to True.  
**inplace** – Used to specify the DataFrame referred to be updated. Default to False. When used True, copy property will be ignored.  
**level** – Used with MultiIndex. Takes Integer value. Default set to None.  
**errors** – Take values raise or ignore. if ‘raise’ is used, raise a KeyError when a dict-like mapper, index, or column contains labels that are not present in the Index being transformed. If ‘ignore’ is used, existing keys will be renamed and extra keys will be ignored. Default set to ignore.

* Pandas Rename Scenario  ->	Rename Column Example   
* Rename columns with list  ->	df.columns=[‘A’,’B’,’C’]   
* Rename column name by index  ->	df.columns.values[2] = “C”   
* Rename the column using Dict  ->	df2=df.rename(columns={‘a’: ‘A’, ‘b’: ‘B’})   
* Rename column using Dict & axis  ->	df2=df.rename({‘a’: ‘A’, ‘b’: ‘B’}, axis=1) 
* Rename column in place  ->	df2=df.rename({‘a’: ‘A’, ‘b’: ‘B’}, axis=’columns’)  
* df.rename(columns={‘a’: ‘A’, ‘b’: ‘B’}, in place = True)  ->	df.rename(columns={‘a’: ‘A’, ‘b’: ‘B’}, inplace = True)  
* Rename using lambda function  ->	df.rename(columns=lambda x: x[1:], inplace=True)  
* Rename with error  ->	df.rename(columns = {‘x’:’X’}, errors = “raise”)  
* Rename using set_axis()  ->	df2=df.set_axis([‘A’,’B’,’C’], axis=1)  

In [51]:
technologies = ({
  'Courses':["Spark","PySpark","Hadoop","Python","pandas","Oracle","Java"],
  'Fee' :[20000,25000,26000,22000,24000,21000,22000],
  'Duration':['30day', '40days' ,'35days', '40days', '60days', '50days', '55days']
              })
df = pd.DataFrame(technologies)

# Rename a Single Column 
df2=df.rename(columns = {'Courses':'Courses_List'})
print(df2.columns)


# Replace existing DataFrame (inplace). This returns None.
df.rename({'Courses':'Courses_List'}, axis='columns', inplace=True)
print(df.columns)

# Rename multiple columns
df.rename(columns = {'Courses':'Courses_List','Fee':'Courses_Fee', 
   'Duration':'Courses_Duration'}, inplace = True)
print(df.columns)

Index(['Courses_List', 'Fee', 'Duration'], dtype='object')
Index(['Courses_List', 'Fee', 'Duration'], dtype='object')
Index(['Courses_List', 'Courses_Fee', 'Courses_Duration'], dtype='object')


In [54]:
# Pandas rename column by index
df.columns.values[2] = "Courses_Duration_2"
print(df.columns)


# Rename columns with list
column_names = ['Courses_List','Courses_Fee','Courses_Duration']
df.columns = column_names
print(df.columns)

Index(['Courses_List', 'Courses_Fee', 'Courses_Duration_2'], dtype='object')
Index(['Courses_List', 'Courses_Fee', 'Courses_Duration'], dtype='object')


In [56]:
# Rename All Column Names by adding Suffix or Prefix
df.columns = column_names
df.columns = ['col_'+str(col) for col in df.columns]
print(df.columns)

Index(['col_Courses_List', 'col_Courses_Fee', 'col_Courses_Duration'], dtype='object')


In [57]:
# Add prefix to the column names
df2=df.add_prefix('pre_')
print(df2.columns)

Index(['pre_col_Courses_List', 'pre_col_Courses_Fee',
       'pre_col_Courses_Duration'],
      dtype='object')


In [59]:
# Add suffix to the column names
df2=df.add_suffix('_suf')
print(df2.columns)

Index(['col_Courses_List_suf', 'col_Courses_Fee_suf',
       'col_Courses_Duration_suf'],
      dtype='object')


In [61]:
# Rename using Lambda function
df.rename(columns=lambda x: 'col_'+x, inplace=True)
print(df.columns)

Index(['col_col_col_Courses_List', 'col_col_col_Courses_Fee',
       'col_col_col_Courses_Duration'],
      dtype='object')


In [62]:
# Change to all lower case
df = pd.DataFrame(technologies)
df2=df.rename(str.lower, axis='columns')
print(df2.columns)

# Change to all upper case
df = pd.DataFrame(technologies)
df2=df.rename(str.upper, axis='columns')
print(df2.columns)

Index(['courses', 'fee', 'duration'], dtype='object')
Index(['COURSES', 'FEE', 'DURATION'], dtype='object')


In [64]:
# Change column name using set_axis()
df.set_axis(['Courses_List', 'Course_Fee', 'Course_Duration'], axis=1)
print(df.columns)

Index(['Courses', 'Fee', 'Duration'], dtype='object')


In [65]:
# Change column name using String.replace()
df.columns = df.columns.str.replace("Fee","Courses_Fee")
print(df.columns)

Index(['Courses', 'Courses_Fee', 'Duration'], dtype='object')


In [66]:
# Throw Error when Rename column doesn't exists.
# df.rename(columns = {'Cour':'Courses_List'}, errors = "raise")

KeyError: "['Cour'] not found in axis"

# 7. Pandas - Get Row count

There are 3 ways to get the row count
1. len(df.index)
2. df.shape[0]
3. df.count()

In [67]:
technologies= {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas"],
    'Courses Fee' :[22000,25000,23000,24000,26000],
    'Duration':['30days','50days','30days', None,np.nan],
    'Discount':[1000,2300,1000,1200,2500]
          }
df = pd.DataFrame(technologies)
print("Create DataFrame:\n", df)

Create DataFrame:
    Courses  Courses Fee Duration  Discount
0    Spark        22000   30days      1000
1  PySpark        25000   50days      2300
2   Hadoop        23000   30days      1000
3   Python        24000     None      1200
4   Pandas        26000      NaN      2500


In [68]:
# Get the row count using len(df.index)
print(df.index)

# Outputs: 
# RangeIndex(start=0, stop=5, step=1)

print('Row count is:', len(df.index))
print('Row count is:', len(df))

# Outputs:
# Row count is:5

RangeIndex(start=0, stop=5, step=1)
Row count is: 5
Row count is: 5


In [69]:

# Get the row count using len(df.axes[0])
print(df.axes)

# Output:
# [RangeIndex(start=0, stop=5, step=1), Index(['Courses', 'Courses Fee', 'Duration', 'Discount'], dtype='object')]

print(df.axes[0])

# Output:
# RangeIndex(start=0, stop=5, step=1)

print('Row count is:', len(df.axes[0]))

# Outputs:
# Row count is:5

[RangeIndex(start=0, stop=5, step=1), Index(['Courses', 'Courses Fee', 'Duration', 'Discount'], dtype='object')]
RangeIndex(start=0, stop=5, step=1)
Row count is: 5


In [70]:

# Get row count using df.shape[0]
df = pd.DataFrame(technologies)
row_count = df.shape[0]  # Returns number of rows
col_count = df.shape[1]  # Returns number of columns
print(row_count)

# Outputs:
# Number of rows: 5

5


In [72]:
# Get count of each column
print(df.count())

Courses        5
Courses Fee    5
Duration       3
Discount       5
dtype: int64


# 8. Pandas - Iterate over rows



In [73]:
technologies = ({
    'Courses':["Spark","PySpark","Hadoop","Python","pandas","Oracle","Java"],
    'Fee' :[20000,25000,26000,22000,24000,21000,22000],
    'Duration':['30day', '40days' ,'35days', '40days', '60days', '50days', '55days']
              })
df = pd.DataFrame(technologies)
print("Create DataFrame:", df)

Create DataFrame:    Courses    Fee Duration
0    Spark  20000    30day
1  PySpark  25000   40days
2   Hadoop  26000   35days
3   Python  22000   40days
4   pandas  24000   60days
5   Oracle  21000   50days
6     Java  22000   55days


In [74]:
# Iterate all rows 
# Using DataFrame.iterrows()
print("After iterating all rows:\n")
for index, row in df.iterrows():
    print (index,row["Fee"], row["Courses"], row["Duration"])

After iterating all rows:

0 20000 Spark 30day
1 25000 PySpark 40days
2 26000 Hadoop 35days
3 22000 Python 40days
4 24000 pandas 60days
5 21000 Oracle 50days
6 22000 Java 55days


In [75]:
# Row contains the column name and data
row = next(df.iterrows())[1]
print("Data For First Row :")
print(row)

Data For First Row :
Courses     Spark
Fee         20000
Duration    30day
Name: 0, dtype: object


In [76]:
# Iterate all rows 
# Using DataFrame.itertuples()
for row in df.itertuples(index = True):
    print (getattr(row,'Index'),getattr(row, "Fee"), getattr(row, "Courses"))

0 20000 Spark
1 25000 PySpark
2 26000 Hadoop
3 22000 Python
4 24000 pandas
5 21000 Oracle
6 22000 Java


In [77]:
# Display one row from iterator
row = next(df.itertuples(index = True,name='Tution'))
print(row)

Tution(Index=0, Courses='Spark', Fee=20000, Duration='30day')


In [78]:
# Another alternate approach by using DataFrame.apply()
print(df.apply(lambda row: str(row["Fee"]) + " " + str(row["Courses"]), axis = 1))

0      20000 Spark
1    25000 PySpark
2     26000 Hadoop
3     22000 Python
4     24000 pandas
5     21000 Oracle
6       22000 Java
dtype: object


In [79]:
# Another alternate approach by using DataFrame.apply()
print(df.apply(lambda row: str(row["Fee"]) + " " + str(row["Courses"]), axis = 1))

0      20000 Spark
1    25000 PySpark
2     26000 Hadoop
3     22000 Python
4     24000 pandas
5     21000 Oracle
6       22000 Java
dtype: object


In [80]:
# Using DataFrame.index
for idx in df.index:
     print(df['Fee'][idx], df['Courses'][idx])

20000 Spark
25000 PySpark
26000 Hadoop
22000 Python
24000 pandas
21000 Oracle
22000 Java


In [81]:
# Another alternate approach 
# By using DataFrame.loc()
for i in range(len(df)) :
  print(df.loc[i, "Fee"], df.loc[i, "Courses"])

20000 Spark
25000 PySpark
26000 Hadoop
22000 Python
24000 pandas
21000 Oracle
22000 Java


In [83]:
# Another alternate approach 
# By using DataFrame.iloc()
for i in range(len(df)) :
  print(df.iloc[i, 1], df.iloc[i, 0])

20000 Spark
25000 PySpark
26000 Hadoop
22000 Python
24000 pandas
21000 Oracle
22000 Java


In [84]:
# Iterate over column by column 
# Using DataFrame.items()
for label, content in df.items():
    print(f'label: {label}')
    print(f'content: {content}', sep='\n')

label: Courses
content: 0      Spark
1    PySpark
2     Hadoop
3     Python
4     pandas
5     Oracle
6       Java
Name: Courses, dtype: object
label: Fee
content: 0    20000
1    25000
2    26000
3    22000
4    24000
5    21000
6    22000
Name: Fee, dtype: int64
label: Duration
content: 0     30day
1    40days
2    35days
3    40days
4    60days
5    50days
6    55days
Name: Duration, dtype: object


# 9. Pandas - groupby()

In Pandas, you can use groupby() with the combination of sum(), count(), pivot(), transform(), aggregate(), and many more methods to perform various operations on grouped data.

### Syntax

DataFrame.groupby(by=None, axis=0, level=None, as_index=True, 
       sort=True, group_keys=True, squeeze=<no_default>, 
       observed=False, dropna=True)

**by** – List of column names to group by  
**axis** – Default to 0. It takes 0 or ‘index’, 1 or ‘columns’  
**level** – Used with MultiIndex.  
**as_index** – sql style grouped output.  
**sort** – Default to True. Specify whether to sort after the group  
**group_keys** – add group keys or not  
**squeeze** – deprecated in new versions  
**observed** – This only applies if any of the groupers are Categoricals.  
**dropna** – Default to False. Use True to drop None/Nan on sorry keys  

In [85]:
technologies   = ({
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Hadoop","Spark","Python","NA"],
    'Fee' :[22000,25000,23000,24000,26000,25000,25000,22000,1500],
    'Duration':['30days','50days','55days','40days','60days','35days','30days','50days','40days'],
    'Discount':[1000,2300,1000,1200,2500,None,1400,1600,0]
          })
df = pd.DataFrame(technologies)
print("Create DataFrame:\n", df)

Create DataFrame:
    Courses    Fee Duration  Discount
0    Spark  22000   30days    1000.0
1  PySpark  25000   50days    2300.0
2   Hadoop  23000   55days    1000.0
3   Python  24000   40days    1200.0
4   Pandas  26000   60days    2500.0
5   Hadoop  25000   35days       NaN
6    Spark  25000   30days    1400.0
7   Python  22000   50days    1600.0
8       NA   1500   40days       0.0


In [86]:
# Use groupby() to compute the sum
df2 =df.groupby(['Courses']).sum()
print("Get sum of grouped data:\n", df2)

Get sum of grouped data:
            Fee      Duration  Discount
Courses                               
Hadoop   48000  55days35days    1000.0
NA        1500        40days       0.0
Pandas   26000        60days    2500.0
PySpark  25000        50days    2300.0
Python   46000  40days50days    2800.0
Spark    47000  30days30days    2400.0


In [87]:
# Group by multiple columns
df2 =df.groupby(['Courses', 'Duration']).sum()
print("Get sum of groupby multiple columns:\n", df2)

Get sum of groupby multiple columns:
                     Fee  Discount
Courses Duration                 
Hadoop  35days    25000       0.0
        55days    23000    1000.0
NA      40days     1500       0.0
Pandas  60days    26000    2500.0
PySpark 50days    25000    2300.0
Python  40days    24000    1200.0
        50days    22000    1600.0
Spark   30days    47000    2400.0


In [88]:
# Add Row Index to the group by result
df2 = df.groupby(['Courses','Duration']).sum().reset_index()
print("After adding index to DataFrame:\n", df2)

After adding index to DataFrame:
    Courses Duration    Fee  Discount
0   Hadoop   35days  25000       0.0
1   Hadoop   55days  23000    1000.0
2       NA   40days   1500       0.0
3   Pandas   60days  26000    2500.0
4  PySpark   50days  25000    2300.0
5   Python   40days  24000    1200.0
6   Python   50days  22000    1600.0
7    Spark   30days  47000    2400.0


In [89]:
# Add Row Index to the group by result
df2 = df.groupby(['Courses','Duration']).sum().reset_index()
print("After adding index to DataFrame:\n", df2)

After adding index to DataFrame:
    Courses Duration    Fee  Discount
0   Hadoop   35days  25000       0.0
1   Hadoop   55days  23000    1000.0
2       NA   40days   1500       0.0
3   Pandas   60days  26000    2500.0
4  PySpark   50days  25000    2300.0
5   Python   40days  24000    1200.0
6   Python   50days  22000    1600.0
7    Spark   30days  47000    2400.0


In [91]:
#You can also choose whether to include NA/None/Nan in group keys or not by
#      setting dropna parameter. By default the value of dropna set to True

# Drop rows that have None/Nan on group keys
df2=df.groupby(by=['Courses'], dropna=False).sum()
print(df2)


           Fee      Duration  Discount
Courses                               
Hadoop   48000  55days35days    1000.0
NA        1500        40days       0.0
Pandas   26000        60days    2500.0
PySpark  25000        50days    2300.0
Python   46000  40days50days    2800.0
Spark    47000  30days30days    2400.0


In [92]:
# Sorting group keys on descending order
groupedDF = df.groupby('Courses',sort=False).sum()
sortedDF=groupedDF.sort_values('Courses', ascending=False)
print(sortedDF)

           Fee      Duration  Discount
Courses                               
Spark    47000  30days30days    2400.0
Python   46000  40days50days    2800.0
PySpark  25000        50days    2300.0
Pandas   26000        60days    2500.0
NA        1500        40days       0.0
Hadoop   48000  55days35days    1000.0


In [93]:
# Sorting group keys on descending order
groupedDF = df.groupby('Courses',sort=False).sum()
sortedDF=groupedDF.sort_values('Courses', ascending=True)
print(sortedDF)

           Fee      Duration  Discount
Courses                               
Hadoop   48000  55days35days    1000.0
NA        1500        40days       0.0
Pandas   26000        60days    2500.0
PySpark  25000        50days    2300.0
Python   46000  40days50days    2800.0
Spark    47000  30days30days    2400.0


In [94]:
# Using apply() & lambda
df2 = df.groupby('Courses').apply(lambda x: x.sort_values('Fee'))
print(df2)

           Courses    Fee Duration  Discount
Courses                                     
Hadoop  2   Hadoop  23000   55days    1000.0
        5   Hadoop  25000   35days       NaN
NA      8       NA   1500   40days       0.0
Pandas  4   Pandas  26000   60days    2500.0
PySpark 1  PySpark  25000   50days    2300.0
Python  7   Python  22000   50days    1600.0
        3   Python  24000   40days    1200.0
Spark   0    Spark  22000   30days    1000.0
        6    Spark  25000   30days    1400.0


In [97]:
# Groupby & multiple aggregations
result = df.groupby('Courses')['Fee'].aggregate(['min','max','count','median'])
print("After applying multiple aggregations on grouped data:\n", result)

After applying multiple aggregations on grouped data:
            min    max  count   median
Courses                              
Hadoop   23000  25000      2  24000.0
NA        1500   1500      1   1500.0
Pandas   26000  26000      1  26000.0
PySpark  25000  25000      1  25000.0
Python   22000  24000      2  23000.0
Spark    22000  25000      2  23500.0


In [98]:
# Groupby multiple columns & multiple aggregations
result = df.groupby('Courses').aggregate({'Duration':'count','Fee':['min','max']})
print("After applying multiple aggregations on grouped data:\n", result)

After applying multiple aggregations on grouped data:
         Duration    Fee       
           count    min    max
Courses                       
Hadoop         2  23000  25000
NA             1   1500   1500
Pandas         1  26000  26000
PySpark        1  25000  25000
Python         2  22000  24000
Spark          2  22000  25000


# 10. Pandas - Shuffle DataFrame rows

By using pandas.DataFrame.sample() method you can shuffle the DataFrame rows randomly, if you are using the NumPy module you can use the permutation() method to change the order of the rows also called the shuffle. Python also has other packages like sklearn that has a method shuffle() to shuffle the order of rows in DataFrame.

* Shuffling DataFrame rows helps in randomizing the order of data, which can be crucial for certain statistical analyses and machine learning tasks.
* The DataFrame.sample() method in Pandas provides a convenient way to shuffle DataFrame rows efficiently without modifying the original DataFrame.
* Shuffling DataFrame rows can help in enhancing the diversity of data subsets, thereby improving the generalization ability of machine learning models.
* The DataFrame.sample() method facilitates row shuffling with parameters such as frac to specify the fraction of rows or n to define the exact number of rows to sample.
* To shuffle the DataFrame in place, use the DataFrame.sample() method with the frac=1 parameter.
* For large datasets, shuffling can be memory-intensive, necessitating careful consideration of computational resources, especially in distributed computing environments.

In [99]:
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","pandas","Oracle","Java"],
    'Fee' :[20000,25000,26000,22000,24000,21000,22000],
    'Duration':['30days','40days','35days','40days','60days','50days','55days'],
    'Discount':[1000,2300,1500,1200,2500,2100,2000]
               }
df = pd.DataFrame(technologies)
print(df)

   Courses    Fee Duration  Discount
0    Spark  20000   30days      1000
1  PySpark  25000   40days      2300
2   Hadoop  26000   35days      1500
3   Python  22000   40days      1200
4   pandas  24000   60days      2500
5   Oracle  21000   50days      2100
6     Java  22000   55days      2000


In [101]:
# Shuffle the DataFrame rows & return all rows
df1 = df.sample(frac = 1) # frac -> decimal -> None: returns 1 record -> 0.5 -> 50 % rows
print(df1)

   Courses    Fee Duration  Discount
2   Hadoop  26000   35days      1500
4   pandas  24000   60days      2500
3   Python  22000   40days      1200
6     Java  22000   55days      2000
1  PySpark  25000   40days      2300
5   Oracle  21000   50days      2100
0    Spark  20000   30days      1000


In [102]:
# Create a new Index starting from zero
df1 = df.sample(frac = 1).reset_index()
print(df1)

   index  Courses    Fee Duration  Discount
0      2   Hadoop  26000   35days      1500
1      0    Spark  20000   30days      1000
2      6     Java  22000   55days      2000
3      1  PySpark  25000   40days      2300
4      4   pandas  24000   60days      2500
5      5   Oracle  21000   50days      2100
6      3   Python  22000   40days      1200


In [103]:
# Drop shuffle Index
df1 = df.sample(frac = 1).reset_index(drop=True)
print(df1)

   Courses    Fee Duration  Discount
0  PySpark  25000   40days      2300
1     Java  22000   55days      2000
2    Spark  20000   30days      1000
3   pandas  24000   60days      2500
4   Oracle  21000   50days      2100
5   Hadoop  26000   35days      1500
6   Python  22000   40days      1200


In [112]:
# Using sample() method to shuffle DataFrame rows and columns
df2 = df.sample(frac=1, axis=1).sample(frac=1).reset_index(drop=True)
print(df2)

   Discount    Fee Duration  Courses
0      1000  20000   30days    Spark
1      2000  22000   55days     Java
2      2300  25000   40days  PySpark
3      1500  26000   35days   Hadoop
4      2100  21000   50days   Oracle
5      1200  22000   40days   Python
6      2500  24000   60days   pandas


# 11. Pandas - Join

Pandas join() is similar to SQL join where it combines columns from multiple DataFrames based on row indices. **In pandas join can be done only on indexes but not on columns. If you want to join on columns you should use pandas.merge() method** as this by default performs on columns. By default, it uses the left join on the row index.


### Syntax of pandas.DataFrame.join() method
DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)

* other – Pass the right DataFrame object or list of DataFrame objects.
* on – Specify which index you want to join on when you have multiple indexes.
* how – Use to specify the join type. Accepts inner, left, right, outer.
* lsuffix – Specify the left suffix string to column names
* rsuffix – Specify the right suffix string to column names
* sort – To specify the results to be sorted.

In [114]:
technologies = {
    'Courses':["Spark","PySpark","Python","pandas"],
    'Fee' :[20000,25000,22000,30000],
    'Duration':['30days','40days','35days','50days'],
              }
index_labels=['r1','r2','r3','r4']
df1 = pd.DataFrame(technologies,index=index_labels)
print("First DataFrame:\n", df1)
technologies2 = {
    'Courses':["Spark","Java","Python","Go"],
    'Discount':[2000,2300,1200,2000]
              }
index_labels2=['r1','r6','r3','r5']
df2 = pd.DataFrame(technologies2,index=index_labels2)
print('--------------------------')
print("Second DataFRame:\n", df2)

First DataFrame:
     Courses    Fee Duration
r1    Spark  20000   30days
r2  PySpark  25000   40days
r3   Python  22000   35days
r4   pandas  30000   50days
--------------------------
Second DataFRame:
    Courses  Discount
r1   Spark      2000
r6    Java      2300
r3  Python      1200
r5      Go      2000


In [116]:
# Pandas join 
df3=df1.join(df2, lsuffix="_left", rsuffix="_right")
print("After joining two DataFrames:\n", df3)

After joining two DataFrames:
    Courses_left    Fee Duration Courses_right  Discount
r1        Spark  20000   30days         Spark    2000.0
r2      PySpark  25000   40days           NaN       NaN
r3       Python  22000   35days        Python    1200.0
r4       pandas  30000   50days           NaN       NaN


In [117]:
# Pandas Inner join DataFrames
df3=df1.join(df2, lsuffix="_left", rsuffix="_right", how='inner')
print(df3)

   Courses_left    Fee Duration Courses_right  Discount
r1        Spark  20000   30days         Spark      2000
r3       Python  22000   35days        Python      1200


In [118]:
# Pandas Right join DataFrames
df3=df1.join(df2, lsuffix="_left", rsuffix="_right", how='right')
print(df3)

   Courses_left      Fee Duration Courses_right  Discount
r1        Spark  20000.0   30days         Spark      2000
r6          NaN      NaN      NaN          Java      2300
r3       Python  22000.0   35days        Python      1200
r5          NaN      NaN      NaN            Go      2000


In [119]:
# Pandas outer join DataFrames
df3=df1.join(df2, lsuffix="_left", rsuffix="_right", how='outer')
print(df3)

   Courses_left      Fee Duration Courses_right  Discount
r1        Spark  20000.0   30days         Spark    2000.0
r2      PySpark  25000.0   40days           NaN       NaN
r3       Python  22000.0   35days        Python    1200.0
r4       pandas  30000.0   50days           NaN       NaN
r5          NaN      NaN      NaN            Go    2000.0
r6          NaN      NaN      NaN          Java    2300.0


In [120]:
# Pandas join on columns
df3=df1.set_index('Courses').join(df2.set_index('Courses'), how='inner')
print(df3)

           Fee Duration  Discount
Courses                          
Spark    20000   30days      2000
Python   22000   35days      1200


In [121]:
# Pandas join
df3=df1.join(df2.set_index('Courses'), how='inner', on='Courses')
print(df3)

   Courses    Fee Duration  Discount
r1   Spark  20000   30days      2000
r3  Python  22000   35days      1200


# 12. Pandas - merge DataFrames

Pandas support pandas.merge() and DataFrame.merge() to merge DataFrames which is exactly similar to SQL join and supports different types of join inner, left, right, outer, cross. By default, it uses inner join where keys don’t match the rows get dropped from both DataFrames, and the result DataFrame contains rows that match on both.

### Pandas.merge() Syntax
pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

#### Pandas.DataFrame.merge() Syntax
DataFrame.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)


In [127]:
technologies = {
    'Courses':["Spark","PySpark","Python","pandas"],
    'Fee' :[20000,25000,22000,30000],
    'Duration':['30days','40days','35days','50days'],
              }
index_labels=['r1','r2','r3','r4']
df1 = pd.DataFrame(technologies,index=index_labels)
print("First DataFrame:\n", df1)

technologies2 = {
    'Courses':["Spark","Java","Python","Go"],
    'Discount':[2000,2300,1200,2000]
              }
index_labels2=['r1','r6','r3','r5']
df2 = pd.DataFrame(technologies2,index=index_labels2)
print('-----------------------------')
print("Second DataFrame:\n", df2)

First DataFrame:
     Courses    Fee Duration
r1    Spark  20000   30days
r2  PySpark  25000   40days
r3   Python  22000   35days
r4   pandas  30000   50days
-----------------------------
Second DataFrame:
    Courses  Discount
r1   Spark      2000
r6    Java      2300
r3  Python      1200
r5      Go      2000


In [129]:
# Using pandas.merge()
df3= pd.merge(df1,df2)
print("After merging the two DataFrames:\n", df3)
print('------------------------')
# Using DataFrame.merge()
df3=df1.merge(df2)
print("After merging the two DataFrames:\n", df3)

After merging the two DataFrames:
   Courses    Fee Duration  Discount
0   Spark  20000   30days      2000
1  Python  22000   35days      1200
------------------------
After merging the two DataFrames:
   Courses    Fee Duration  Discount
0   Spark  20000   30days      2000
1  Python  22000   35days      1200


In [130]:
# Merge DataFrames by Columns
df3=pd.merge(df1,df2, on='Courses')

# When column names are different
df3=pd.merge(df1,df2, left_on='Courses', right_on='Courses')
print("After merging the two DataFrames:\n", df3)
print('------------------------')
# Merge DataFrames by Index
df3 = pd.merge(df1,df2,left_index=True,right_index=True)
print(df3)

After merging the two DataFrames:
   Courses    Fee Duration  Discount
0   Spark  20000   30days      2000
1  Python  22000   35days      1200
------------------------
   Courses_x    Fee Duration Courses_y  Discount
r1     Spark  20000   30days     Spark      2000
r3    Python  22000   35days    Python      1200


In [133]:
# Use pandas.merge() to on multiple columns
df3 = pd.merge(df3, df1,  how='left', left_on=['Courses_x','Fee'], right_on = ['Courses','Fee'])
print(df3)

  Courses_x    Fee Duration_x Courses_y  Discount Courses Duration_y
0     Spark  20000     30days     Spark      2000   Spark     30days
1    Python  22000     35days    Python      1200  Python     35days


In [134]:
# Merge by left Join
df3=pd.merge(df1,df2, on='Courses', how='left')
print(df3)

   Courses    Fee Duration  Discount
0    Spark  20000   30days    2000.0
1  PySpark  25000   40days       NaN
2   Python  22000   35days    1200.0
3   pandas  30000   50days       NaN


In [135]:
# Merge by right Join
df3=pd.merge(df1,df2, on='Courses', how='right')
print(df3)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days      2000
1    Java      NaN      NaN      2300
2  Python  22000.0   35days      1200
3      Go      NaN      NaN      2000


In [136]:
# Merge by outer Join
df3=pd.merge(df1,df2, on='Courses', how='outer')
print(df3)

   Courses      Fee Duration  Discount
0    Spark  20000.0   30days    2000.0
1  PySpark  25000.0   40days       NaN
2   Python  22000.0   35days    1200.0
3   pandas  30000.0   50days       NaN
4     Java      NaN      NaN    2300.0
5       Go      NaN      NaN    2000.0


# 13. Pandas - Concat DataFrames

You can use the pandas.concat() function to concatenate or merge two or more pandas DataFrames either along rows or columns.  When concatenating DataFrames along rows, concat() creates a new DataFrame that includes all rows from the input DataFrames, effectively appending one DataFrame to another. Conversely, when concatenating along columns, concat() performs a join operation, combining the DataFrames side-by-side based on their indexes.

### Syntax of concat() function
pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True)


* objs – This is a sequence or mapping of Series or DataFrame objects. If a dictionary is passed, the keys will be used to construct a hierarchical index.   
* axis – {0 or ‘index’, 1 or ‘columns’}, default 0. The axis concatenates along. 0 or 'index' means concatenate along rows (i.e., vertically). 1 or 'columns' means concatenate along columns (i.e., horizontally).    
* join – Type of join to be performed. It can be ‘inner’ or ‘outer’. Defaults to ‘outer’.    
* ignore_index – If True, do not use the index values along the concatenation axis. Defaults to False.    
* keys – Values to associate with the concatenated objects along the concatenation axis. It’s useful for creating a hierarchical index.    
* levels – Specific level(s) (zero-indexed) from the keys to use as index levels.
* names – Names for the levels in the resulting hierarchical index.    
* verify_integrity – If True, check whether the new concatenated axis contains duplicates. Defaults to False.    
* sort – If True, sort the resulting DataFrame by the labels along the concatenation axis. Defaults to False.    
* copy – If False, avoid copying data unnecessarily. Defaults to True.    

In [138]:
df = pd.DataFrame({'Courses': ["Spark","PySpark","Python","pandas"],
                    'Fee' : [20000,25000,22000,24000]})

df1 = pd.DataFrame({'Courses': ["Pandas","Hadoop","Hyperion","Java"],
                    'Fee': [25000,25200,24500,24900]})
print("First DataFrame:\n", df)
print('----------------')
print("Second DataFrame:\n", df1)

First DataFrame:
    Courses    Fee
0    Spark  20000
1  PySpark  25000
2   Python  22000
3   pandas  24000
----------------
Second DataFrame:
     Courses    Fee
0    Pandas  25000
1    Hadoop  25200
2  Hyperion  24500
3      Java  24900


In [139]:
# Using pandas.concat() to concat two DataFrames
data = [df, df1]
df2 = pd.concat(data)
print("After concatenating the two DataFrames:\n", df2)

After concatenating the two DataFrames:
     Courses    Fee
0     Spark  20000
1   PySpark  25000
2    Python  22000
3    pandas  24000
0    Pandas  25000
1    Hadoop  25200
2  Hyperion  24500
3      Java  24900


In [142]:
# Use pandas.concat() method to ignore_index 
df2 = pd.concat([df, df1], ignore_index=True, sort=False)
print(df2)

    Courses    Fee
0     Spark  20000
1   PySpark  25000
2    Python  22000
3    pandas  24000
4    Pandas  25000
5    Hadoop  25200
6  Hyperion  24500
7      Java  24900


# 14. Pandas - .fillna()

pandas.DataFrame.fillna() method is used to fill column (one or multiple columns) containing NA/NaN/None with 0, empty, blank, or any specified values etc. NaN is considered a missing value. When you dealing with machine learning, handling missing values is very important, not handling these will result in a side effect with an incorrect result.

### Syntax of pandas.DataFrame.fillna()
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

* value – Takes either scalar, dict, Series, or DataFrame but not list.
* method – Takes one of these values {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}. Default None.
* axis – 0 or ‘index’, 1 or ‘columns’. Used to specify the axis to fill the values.
* inplace – Default False. When used True, it updates the existing DataFrame object.
* limit – Specify how many fills should happen. This is the maximum number of consecutive NaN values replaced with specified values.
* downcast – It takes a dict of key-value pair that specifies data type to downcast. Like Float64 to int64, date to string e.t.c

In [143]:
df = pd.DataFrame(({
     'Courses':["Spark",'Java',"Scala",'Python'],
     'Fee' :[20000,np.nan,26000,24000],
     'Duration':['30days','40days', pd.NA,'40days'],
     'Discount':[1000,np.nan,2500,None]
               }))
print("Create DataFrame:\n", df)

Create DataFrame:
   Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       NaN


In [144]:
# Fillna to replace all NaN
df2 = df.fillna('None')
print("After replacing all NAN/NA values with None:\n", df2)

After replacing all NAN/NA values with None:
   Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java     None   40days     None
2   Scala  26000.0     None   2500.0
3  Python  24000.0   40days     None


In [145]:
# Fillna on one column
df2['Discount'] =  df['Discount'].fillna('0')
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java     None   40days        0
2   Scala  26000.0     None   2500.0
3  Python  24000.0   40days        0


In [146]:
# Fillna() on multiple columns
df2[['Discount','Fee']] =  df[['Discount','Fee']].fillna('0')
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java        0   40days        0
2   Scala  26000.0     None   2500.0
3  Python  24000.0   40days        0


In [147]:
# Fillna() on multiple columns
df2 =  df.fillna(value={'Discount':'0','Fee':10000})
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java  10000.0   40days        0
2   Scala  26000.0     <NA>   2500.0
3  Python  24000.0   40days        0


In [148]:
# Fill with limit
df2=df.fillna(value={'Discount':0,'Fee':0},limit=1)
print(df2)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      0.0   40days       0.0
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       NaN


# 15. Pandas - .dropna() 

pandas.DataFrame.dropna() is used to drop/remove missing values from rows and columns, np.nan/pd.NaT (Null/None) are considered as missing values. Before we process the data, it is very important to clean up the missing data, as part of cleaning we would be required to identify the rows with Null/NaN/None values and drop them. This dropna() method comes in handy to drop rows with np.nan/pd.NaT values.

### Pandas.DataFrame.dropna() syntax
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

* pandas.DataFrame.dropna() is used to drop columns with NaN/None values from DataFrame.  
* numpy.nan is Not a Number (NaN), which is of Python build-in numeric type float (floating point).  
* Set axis=1 to drop columns containing NaN values instead of rows.  
* None is of NoneType and it is an object in Python.  
* Use how='all' to remove rows or columns only if every entry is NaN.  
Specify thresh to keep rows or columns that meet a minimum count of non-NaN values.
* Apply dropna() conditionally by specifying columns in subset where non-NaN values are required.  

In [149]:
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","pandas",np.nan],
    'Fee' :[20000,25000,26000,23093,24000,np.nan],
    'Duration':['30day','40days','35days','45days',np.nan,np.nan],
    'Discount':[1000,np.nan,1200,2500,pd.NaT,np.nan],
    '':[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]
              }
index_labels=['r1','r2','r3','r4','r5','']
df = pd.DataFrame(technologies,index=index_labels)
print(df)

    Courses      Fee Duration Discount    
r1    Spark  20000.0    30day     1000 NaN
r2  PySpark  25000.0   40days      NaN NaN
r3   Hadoop  26000.0   35days     1200 NaN
r4   Python  23093.0   45days     2500 NaN
r5   pandas  24000.0      NaN      NaT NaN
        NaN      NaN      NaN      NaN NaN


In [150]:
# Drop rows that has all Nan Values
df = df.dropna(how='all')
print(df)

    Courses      Fee Duration Discount    
r1    Spark  20000.0    30day     1000 NaN
r2  PySpark  25000.0   40days      NaN NaN
r3   Hadoop  26000.0   35days     1200 NaN
r4   Python  23093.0   45days     2500 NaN
r5   pandas  24000.0      NaN      NaT NaN


In [151]:
# Drop columns that has all Nan Values
df = df.dropna(how='all',axis=1)
print(df)

    Courses      Fee Duration Discount
r1    Spark  20000.0    30day     1000
r2  PySpark  25000.0   40days      NaN
r3   Hadoop  26000.0   35days     1200
r4   Python  23093.0   45days     2500
r5   pandas  24000.0      NaN      NaT


In [152]:
# Drop rows that contains nan values
df2=df.dropna()
print(df2)

   Courses      Fee Duration Discount
r1   Spark  20000.0    30day     1000
r3  Hadoop  26000.0   35days     1200
r4  Python  23093.0   45days     2500


In [154]:
# Drop columns that contains nan values
df2=df.dropna(axis=1)
print(df2)

    Courses      Fee
r1    Spark  20000.0
r2  PySpark  25000.0
r3   Hadoop  26000.0
r4   Python  23093.0
r5   pandas  24000.0


In [155]:
# Drop rows that has NaN values on selected columns
df2=df.dropna(subset=['Courses','Duration'])
print(df2)

    Courses      Fee Duration Discount
r1    Spark  20000.0    30day     1000
r2  PySpark  25000.0   40days      NaN
r3   Hadoop  26000.0   35days     1200
r4   Python  23093.0   45days     2500


In [157]:
# With threshold, 
# Keep only the rows with at least 2 non-NA values.
df2=df.dropna(thresh=2)