In [4]:
# Install pandas with the %pip line magic (installs into the environment the notebook kernel runs in).
%pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import pandas
import pandas as pd

## DataFrames
A DataFrame is a two-dimensional labeled data structure with columns of potentially 
different data types, similar to a spreadsheet or SQL table. 
It provides a powerful and flexible way to manipulate and analyze structured data in Python, 
offering functionalities for data analysis.

In [6]:
# Creating an empty DataFrame (no data is passed to the constructor)
df = pd.DataFrame()
# A bare name as the last line of a cell displays that object
df

In [7]:
# Creating a DataFrame from a list of lists: each inner list is one row,
# and `columns` supplies the column labels.
row_data = [
    ["Alice", 23],
    ["Michael", 24],
    ["Lucas", 19],
]
df = pd.DataFrame(row_data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Alice,23
1,Michael,24
2,Lucas,19


In [8]:
# Creating a DataFrame from a dictionary of lists:
# each key is a column label and each list holds that column's values.
data = {"Name": ["Alice", "Michael", "Lucas"], "Age": [23, 25, 19]}
pd.DataFrame(data)

Unnamed: 0,Name,Age
0,Alice,23
1,Michael,25
2,Lucas,19


In [9]:
# Creating a DataFrame from a list of dictionaries:
# each dictionary is one row, keyed by column label.
data = [
    {"Name": "Alice", "Age": 23},
    {"Name": "Michael", "Age": 25},
    {"Name": "Lucas", "Age": 19},
]
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,23
1,Michael,25
2,Lucas,19


## Series

A pandas Series is a one-dimensional labeled array capable of 
holding data of any type (integer, string, float, etc.). 
It's similar to a one-column table or an array with associated labels, 
providing powerful indexing and manipulation capabilities in Python.

In [10]:
# A Series built from a list of integers; the left column of the output is the index
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [11]:
# Mixing integers and strings in one Series: the recorded output reports dtype object
s = pd.Series([1,2,"Three",4,"Five"])
s

0        1
1        2
2    Three
3        4
4     Five
dtype: object

Common data types in pandas:
- Integer (int64): Represents whole numbers (64 bit integer).
- Float (float64): Represents numbers with decimals (e.g., 3.14, -12.5).
- Boolean (bool): Represents logical True or False values.
- Object: This is a versatile but less efficient type that can store various data types 
like strings, lists, or custom objects. 
    Pandas uses this type when it cannot infer a more specific data type.

In [12]:
# One example Series per basic dtype; pandas infers the dtype from the values.
# Only the last expression of the cell is rendered, so the bare series names
# between the assignments do not produce output.

# Integer (int64)
integer_series = pd.Series([1,2,3,4,5])
integer_series

# Float (float64)
float_series = pd.Series([3.14,-3.14,-2.01,2.01])
float_series

# Boolean (either True or False)
boolean_series = pd.Series([True,False])
boolean_series

# Object (mixed integers and strings); named in snake_case like the other series
object_series = pd.Series([1,2,"Three",4,"Five"])
object_series

0        1
1        2
2    Three
3        4
4     Five
dtype: object

In [13]:
# Whole numbers -> int64
integer_series = pd.Series(
    [1, 2, 3, 4, 5]
)
integer_series
# Numbers with decimals -> float64
float_series = pd.Series(
    [3.14, -3.14, 2.01, -2.01]
)
float_series
# True / False values -> bool
boolean_series = pd.Series(
    [True, False]
)
boolean_series
# Mixed integers and strings -> object
object_series = pd.Series(
    [1, 2, "Three", 4, "Five"]
)
object_series

0        1
1        2
2    Three
3        4
4     Five
dtype: object

## Specialized Data Types

Pandas also provides specialized data types:
- Datetime (datetime64[ns]): Represents dates and times with nanosecond precision. 
    Useful for time-series data analysis.
- Timedelta (timedelta64[ns]): Represents durations between timestamps.
- Categorical: Represents categorical data with predefined categories. 
    Efficient for storing limited sets of categories.
- Sparse: Represents data in which most values are the same fill value (such as NaN or 0). 
    Stores data efficiently by keeping only the values that differ from the fill value.

### Datetime (datetime64[ns])

In [14]:
# Parse each date string with pd.to_datetime, then collect the results in a Series
date_strings = ['2024-04-05', '2024-05-05', '2024-06-05']
datetime_series = pd.Series([pd.to_datetime(text) for text in date_strings])
datetime_series

0   2024-04-05
1   2024-05-05
2   2024-06-05
dtype: datetime64[ns]

### Timedelta (timedelta64[ns])

In [15]:
# Each pd.Timedelta is a duration built from days/hours/minutes;
# a Series of them holds one duration per row.
timedelta_series = pd.Series([
    pd.Timedelta(days=8, hours=3, minutes=30),
    pd.Timedelta(days=4, hours=3, minutes=20),
    pd.Timedelta(days=2, hours=3, minutes=10),
])
timedelta_series

0   8 days 03:30:00
1   4 days 03:20:00
2   2 days 03:10:00
dtype: timedelta64[ns]

### Categorical

In [16]:
# Wrap the labels in pd.Categorical so the resulting Series has the category dtype
categorical_series = pd.Series(pd.Categorical(["Sales","Marketing","Operations"]))
categorical_series

0         Sales
1     Marketing
2    Operations
dtype: category
Categories (3, object): ['Marketing', 'Operations', 'Sales']

### Sparse

In [17]:
# Sparse array that contains a missing value (pd.NA).
# NOTE(review): the recorded output reports dtype Sparse[object, nan], i.e. the values
# are held as generic objects rather than a numeric dtype — confirm that is intended.
sparse_series = pd.Series(pd.arrays.SparseArray([1,2,3,pd.NA,4,5]))
sparse_series

0      1
1      2
2      3
3    NaN
4      4
5      5
dtype: Sparse[object, nan]

In [18]:
# Sparse array of integers; the recorded dtype Sparse[int64, 0] names 0 as the fill value
sparse_series = pd.Series(pd.arrays.SparseArray([1,2,3,0,4,5,0]))
sparse_series

0    1
1    2
2    3
3    0
4    4
5    5
6    0
dtype: Sparse[int64, 0]

In [19]:
# Check the data type of a Series through its .dtype attribute
integer_series.dtype

dtype('int64')

### Changing Data Types

In [20]:
# From integer to float: astype() returns a converted Series, stored here under a new name
converted_from_integer_to_float = integer_series.astype('float64')
converted_from_integer_to_float

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [21]:
# Display float_series
float_series

0    3.14
1   -3.14
2    2.01
3   -2.01
dtype: float64

In [22]:
# From float to integer: the fractional part is dropped (3.14 -> 3, -3.14 -> -3), not rounded
converted_from_float_to_integer = float_series.astype('int64')
converted_from_float_to_integer

0    3
1   -3
2    2
3   -2
dtype: int64

**Example: Sales Data Analysis**

You have a dataset of sales transactions that includes the product name, quantity sold, and sale price. 
You want to analyze the data to find the total revenue per product.

In [23]:
# Sales transactions, one row per sale: product, units sold and unit price
data = {
    'Product Name': ['A', 'B', 'C', 'A', 'B', 'A'],
    'Quantity Sold': [3, 2, 5, 4, 1, 2],
    'Sale Price': [10, 20, 10, 15, 20, 15],
}
sales_df = pd.DataFrame(data)
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price
0,A,3,10
1,B,2,20
2,C,5,10
3,A,4,15
4,B,1,20
5,A,2,15


In [24]:
# Getting the Product Name column: indexing with a single label returns a Series
sales_df['Product Name']

0    A
1    B
2    C
3    A
4    B
5    A
Name: Product Name, dtype: object

In [25]:
# Column arithmetic is element-wise: revenue per row = quantity sold * sale price
quantity_sold = sales_df["Quantity Sold"]
sale_price = sales_df["Sale Price"]
sales_df["Total Revenue"] = quantity_sold * sale_price
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60
4,B,1,20,20
5,A,2,15,30


In [26]:
# Get the overall revenue with the Series' own vectorized sum()
# rather than iterating over it with Python's built-in sum()
print("Overall Revenue:", sales_df["Total Revenue"].sum())

Overall Revenue: 230


In [27]:
# Total revenue per product. Selecting with a list ([['Total Revenue']]) keeps the
# result a DataFrame indexed by 'Product Name', so no empty frame has to be
# created and filled afterwards.
result_df = sales_df.groupby("Product Name")[['Total Revenue']].sum()
result_df

Unnamed: 0_level_0,Total Revenue
Product Name,Unnamed: 1_level_1
A,120
B,60
C,50


In [28]:
# Sum each metric per product; both results are indexed by 'Product Name'
revenue_per_product = sales_df.groupby("Product Name")['Total Revenue'].sum()
quantity_per_product = sales_df.groupby("Product Name")['Quantity Sold'].sum()

# Start from an empty frame and attach the two totals as columns
result_df = pd.DataFrame()
result_df['Total Revenue'] = revenue_per_product
result_df['Total Quantity Sold'] = quantity_per_product
result_df

Unnamed: 0_level_0,Total Revenue,Total Quantity Sold
Product Name,Unnamed: 1_level_1,Unnamed: 2_level_1
A,120,9
B,60,3
C,50,5


In [29]:
# Total revenue and total quantity sold per product in a single groupby pass,
# using named aggregation: output column = (input column, aggregation).
# The ** dict form is used because the output column names contain spaces.
result_df = sales_df.groupby('Product Name').agg(
    **{
        'Total Revenue': ('Total Revenue', 'sum'),
        'Total Quantity Sold': ('Quantity Sold', 'sum'),
    }
)
result_df

Unnamed: 0_level_0,Total Revenue,Total Quantity Sold
Product Name,Unnamed: 1_level_1,Unnamed: 2_level_1
A,120,9
B,60,3
C,50,5


### **Data Selection**

Pandas provides numerous methods for selecting and indexing data in Series and DataFrames, 
including label-based indexing with .loc, integer-position based indexing with .iloc, and conditional selection.

In [30]:
# Display the full sales DataFrame as a reference for the selection examples
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60
4,B,1,20,20
5,A,2,15,30


In [31]:
# Slice syntax: [start : stop (exclusive) : step]
# Rows 0 and 1 of the 'Product Name' column
sales_df['Product Name'][0:2]

0    A
1    B
Name: Product Name, dtype: object

In [32]:
# Rows 2, 3 and 4 of 'Quantity Sold' (the stop value 5 is excluded)
sales_df['Quantity Sold'][2:5]

2    5
3    4
4    1
Name: Quantity Sold, dtype: int64

In [33]:
# Every second row of 'Sale Price' (step of 2, starting from the first row)
sales_df['Sale Price'][::2]

0    10
2    10
4    20
Name: Sale Price, dtype: int64

In [34]:
# Sum of every second 'Sale Price' value
sales_df['Sale Price'][::2].sum()

np.int64(40)

#### Index Location (.iloc)
- Selects rows (and optionally columns) by integer position rather than by label.
- Slicing a DataFrame with .iloc returns a DataFrame instead of a Series.

In [35]:
# Getting the first 3 rows by integer position with .iloc
# Slice syntax: [start : stop (exclusive) : step]
sales_df.iloc[:3]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50


In [36]:
# Getting the first 3 rows by slicing the DataFrame directly (same rows as .iloc[:3] above)
# Slice syntax: [start : stop (exclusive) : step]
sales_df[:3]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50


#### Location (.loc)
- Access a group of rows and columns by label(s) or a boolean array.

In [37]:
# .loc slices by label: [start : stop (inclusive) : step], so :2 returns the rows labelled 0, 1 and 2.
# The second argument lists the columns to keep.
sales_df.loc[:2, ['Quantity Sold', 'Sale Price', 'Total Revenue']]


Unnamed: 0,Quantity Sold,Sale Price,Total Revenue
0,3,10,30
1,2,20,40
2,5,10,50


### Conditional Filtering

In [38]:
# Display the full sales DataFrame as a reference for the filtering examples
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60
4,B,1,20,20
5,A,2,15,30


In [39]:
# Rows whose Product Name equals "A": the comparison builds a boolean mask,
# and indexing with that mask keeps only the rows where it is True
sales_df[sales_df['Product Name'] == "A"]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
0,A,3,10,30
3,A,4,15,60
5,A,2,15,30


In [40]:
# Rows whose Total Revenue is at least 40
sales_df[sales_df['Total Revenue']>= 40]

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue
1,B,2,20,40
2,C,5,10,50
3,A,4,15,60


## Apply

The apply function in pandas is a powerful tool for working with DataFrames and Series. 
On a DataFrame it applies a custom function to each row or column; on a Series it applies the function to each value. 
It returns a new DataFrame or Series based on the results.

In [41]:
def discount(original_price, discount_rate=0.10):
    """Return the price left after taking a fractional discount off ``original_price``.

    Args:
        original_price: Price before the discount.
        discount_rate: Fraction of the price to take off. Defaults to 0.10
            (10%), which keeps existing one-argument calls unchanged.

    Returns:
        ``original_price`` minus ``original_price * discount_rate``.
    """
    discounted_amount = original_price * discount_rate
    return original_price - discounted_amount
# apply() calls discount once per value of 'Sale Price'; the results become a new column
sales_df['10% Discounted Price'] = sales_df['Sale Price'].apply(discount)
sales_df

Unnamed: 0,Product Name,Quantity Sold,Sale Price,Total Revenue,10% Discounted Price
0,A,3,10,30,9.0
1,B,2,20,40,18.0
2,C,5,10,50,9.0
3,A,4,15,60,13.5
4,B,1,20,20,18.0
5,A,2,15,30,13.5


## Common Pandas Methods

Data Loading and Exploration:

- head(): Shows the first few rows of a DataFrame
- tail(): Shows the last few rows of a DataFrame
- describe(): Generates summary statistics for each column (mean, standard deviation, etc.)
- info(): Displays information about the DataFrame, including data types and memory usage

Data Analysis:

- sum(): Calculates the sum of a Series or DataFrame
- mean(): Calculates the mean of a Series or DataFrame
- median(): Calculates the median of a Series or DataFrame
- std(): Calculates the standard deviation of a Series or DataFrame
- var(): Calculates the variance of a Series or DataFrame

In [42]:
# Ten products (P1..P10) with one rating each
reviews_data = {
    'ProductID': [f'P{number}' for number in range(1, 11)],
    'Rating': [5, 3, 2, 3, 4, 5, 2, 4, 3, 1],
}
reviews_df = pd.DataFrame(reviews_data)
reviews_df

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,3
4,P5,4
5,P6,5
6,P7,2
7,P8,4
8,P9,3
9,P10,1


In [43]:
# head() with no argument returns the first five rows
reviews_df.head()

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,3
4,P5,4


In [44]:
# head(n) returns the first n rows — here the first three
reviews_df.head(3)

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2


In [45]:
# tail() with no argument returns the last five rows
reviews_df.tail()

Unnamed: 0,ProductID,Rating
5,P6,5
6,P7,2
7,P8,4
8,P9,3
9,P10,1


In [46]:
# tail(n) returns the last n rows — here the last three
reviews_df.tail(3)

Unnamed: 0,ProductID,Rating
7,P8,4
8,P9,3
9,P10,1


In [47]:
# Summary statistics; the recorded output covers only the numeric 'Rating' column
reviews_df.describe()

Unnamed: 0,Rating
count,10.0
mean,3.2
std,1.316561
min,1.0
25%,2.25
50%,3.0
75%,4.0
max,5.0


In [48]:
# Display the full reviews DataFrame
reviews_df

Unnamed: 0,ProductID,Rating
0,P1,5
1,P2,3
2,P3,2
3,P4,3
4,P5,4
5,P6,5
6,P7,2
7,P8,4
8,P9,3
9,P10,1


In [49]:
# Print the index range, per-column non-null counts and dtypes, and memory usage
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ProductID  10 non-null     object
 1   Rating     10 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 292.0+ bytes


In [50]:
# Sum of all ratings
reviews_df['Rating'].sum()

np.int64(32)

In [51]:
# Mean (arithmetic average) of the ratings
reviews_df['Rating'].mean()

np.float64(3.2)

In [52]:
# Standard deviation of the ratings (sample standard deviation: pandas divides by n - 1 by default)
reviews_df['Rating'].std()

np.float64(1.3165611772087666)

In [53]:
# Variance of the ratings (sample variance: pandas divides by n - 1 by default)
reviews_df['Rating'].var()

np.float64(1.7333333333333334)

### **Importing and Exporting Data**

Pandas supports reading from and writing to a variety of file formats, 
including CSV, Excel, and SQL, making it easy to integrate with data analysis workflows.

In [54]:
# Read 'example.csv' (path relative to the notebook's working directory) into a DataFrame
data = pd.read_csv('example.csv')
data

Unnamed: 0,A,B,C
0,1.0,5.0,10.0
1,2.0,6.5,11.0
2,2.333333,6.5,12.0
3,4.0,8.0,11.0


In [55]:
# Install openpyxl, which lets us export to Excel (.xlsx) files
%pip install openpyxl



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [56]:
# Write the DataFrame to an Excel workbook on a sheet named "Example Sheet";
# index=False leaves the row index out of the file
data.to_excel('exported_dataframe_to_excel.xlsx', sheet_name="Example Sheet", index=False)

In [57]:
# Write the DataFrame to a CSV file; index=False leaves the row index out of the file
data.to_csv('exported_dataframe_to_csv.csv', index =False)

In [58]:
# Read 'adult.csv' into a DataFrame; this frame is used for the inspection and cleaning sections
df = pd.read_csv('adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## **Data Inspection** 

Data inspection is the initial review of a dataset to find missing values, 
incorrect data types, and gather basic statistics, providing insights into its quality and structure.

In [59]:
# Identify the missing values: isnull() returns a DataFrame of booleans, one per cell

# True = missing (null), False = present

df.isnull()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48838,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48839,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
48840,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [60]:
# Summing booleans counts the True values (True = 1, False = 0),
# giving the number of missing values in each column
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [61]:
# Display the DataFrame; some cells hold a '?' placeholder instead of a value
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [62]:
# Count the '?' placeholders in each column: isnull() does not treat them as missing
(df == '?').sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [63]:
# Identify the data type of each column
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [64]:
# Initial analysis: summary statistics (the recorded output covers the int64 columns only)
df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


## **Cleaning Data**

Cleaning data involves eliminating or rectifying inaccuracies, inconsistencies, 
and missing values within your dataset, utilizing techniques such as handling 
missing values via deletion or imputation, rectifying data types, and detecting 
and eliminating duplicate entries, ultimately resulting in more precise and dependable analysis.

In [65]:
# Display the DataFrame before cleaning
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [66]:
# Replace the '?' placeholder with pd.NA so that pandas treats those cells as missing.
# inplace=True applies the change to the current DataFrame; inplace=False returns a modified copy instead.
df.replace('?',pd.NA,inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [67]:
# Count the null values in each column now that '?' has been replaced with pd.NA
df.isnull().sum()


age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [82]:
# fillna = replace the null values (here: missing occupations become "Unemployed")
# NOTE(review): this cell's execution count (In [82]) is higher than that of the two
# cells that follow it (In [79], In [83]), and the recorded output (45222 rows, hyphens
# already replaced) reflects that order. Run top-to-bottom on a fresh kernel, this fill
# happens before dropna() — confirm which order is intended.
df['occupation'] = df['occupation'].fillna("Unemployed")
df['occupation']

0        Machine op inspct
1          Farming fishing
2          Protective serv
3        Machine op inspct
5            Other service
               ...        
48837         Tech support
48838    Machine op inspct
48839         Adm clerical
48840         Adm clerical
48841      Exec managerial
Name: occupation, Length: 45222, dtype: object

In [79]:
# Drop missing values: remove every row that still contains at least one null.
# inplace=True modifies df itself instead of returning a new DataFrame.
df.dropna(inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [83]:
# Swap every hyphen in the occupation labels for a space; regex=True makes
# replace() match the pattern inside each string instead of whole cell values
occupation_with_spaces = df['occupation'].replace('-', ' ', regex=True)
df['occupation'] = occupation_with_spaces
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine op inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine op inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine op inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm clerical,Own-child,White,Male,0,0,20,United-States,<=50K
