In [1]:
# importing required libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests
import re
#pd.set_option('display.max_rows', None)

In [2]:
# Load the scraped billionaire table and the list of female billionaire names.
# Both frames come back with a leftover 'Unnamed: 0' column (visible in the
# null-count outputs below); those columns are dropped after the merge.
data = pd.read_csv("billionare_file.csv")
female_data = pd.read_csv("female_billionares.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
0,0,https://specials-images.forbesimg.com/imageser...,1.0,Elon Musk,$242.7 B,52.0,"Tesla, SpaceX",United States,https://www.forbes.com/profile/elon-musk/?list...,Automotive
1,1,https://specials-images.forbesimg.com/imageser...,41.0,Robin Zeng,$32.7 B,54.0,Batteries,Hong Kong,https://www.forbes.com/profile/robin-zeng/?lis...,Automotive
2,2,https://specials-images.forbesimg.com/imageser...,57.0,Stefan Quandt,$26.6 B,57.0,BMW,Germany,https://www.forbes.com/profile/stefan-quandt/?...,Automotive
3,3,https://specials-images.forbesimg.com/imageser...,61.0,Susanne Klatten,$25.3 B,61.0,"BMW, pharmaceuticals",Germany,https://www.forbes.com/profile/susanne-klatten...,Automotive
4,4,//specials-images.forbesimg.com/imageserve/5dc...,84.0,Wang Chuanfu,$19.8 B,57.0,"Batteries, automobiles",China,https://www.forbes.com/profile/wang-chuanfu/?l...,Automotive


In [4]:
female_data.head()

Unnamed: 0.1,Unnamed: 0,female_name
0,0,Francoise Bettencourt Meyers & family
1,1,Alice Walton
2,2,Julia Koch & family
3,3,Jacqueline Mars
4,4,Miriam Adelson & family


# Summary of the Data

This dataset covers more than 2,600 billionaires listed on the Forbes "Real-Time Billionaires" list in 2023. It provides some basic information about each billionaire.

## Columns Description

#### **Table** -> `data`:

- `Image` - Link to a small image of the billionaire.

- `Rank` - The billionaire's rank, based on net worth as calculated by Forbes.

- `Name` - Name of the billionaire.

- `Net_Worth` - The billionaire's real-time net worth, in billions of dollars.

- `Age` - Age of the billionaire.

- `Source` - The company or business through which they earn their money.

- `Country` - The country the billionaire (or their company) belongs to.

- `internal_link` - Link to the billionaire's Forbes profile, which holds extra information. This column will be used later to scrape that extra information.

- `Industry` - The industry their company belongs to.

#### **Table** -> `female_data`:

**female_data** contains only the names of female billionaires, so that I can distinguish between male and female billionaires. To do this, I will add a new column called **Sex** to **female_data**, with the value **Female**.

In [5]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
0,0,https://specials-images.forbesimg.com/imageser...,1.0,Elon Musk,$242.7 B,52.0,"Tesla, SpaceX",United States,https://www.forbes.com/profile/elon-musk/?list...,Automotive


### Issues with Dataset

#### 1. table -> data
   - `Image Column` - Some of the links are incomplete: the 'https:' prefix is missing.
   
   - `Name column` - Some rows have **& family** appended to the name. - `Dirty Data`
   
   - `Rank Column` - This column is not sorted in ascending order; the rows are in random order.
   
   - `Net_Worth` - The dollar sign and "B" are included in the net worth value. - `Dirty Data`
   
   - `Age` - This column has some missing values.
   
   - Some rows are entirely null. They were produced during scraping; no billionaire's data is actually missing, so I will drop them later. - `Dirty Data`
   
#### 2. table -> female_data
   - `Name Column` - Some rows have **& family** appended to the name. - `Dirty Data`
   - The **Sex** column does not exist yet; I will create it.

## Data Cleaning

#### Table = data

- Remove null values.
- Add "https:" to the incomplete links in the Image column.
- Sort the Rank column in ascending order.
- Remove **& family** from the Name column.
- Remove the dollar sign and "B" from the Net_Worth column.
- Fill in as many missing Age values as possible.
- Separate the "internal_link" column from this dataset.
- Convert the Rank, Net_Worth, and Age columns, which have incorrect data types, to the correct types.


#### Table = female_data

- Remove **& family** from the name column.
- Create a **Sex** column with the value **Female**.
- This dataset should not stand alone; merge it with `data`.

### 1. Table --> Data

### Remove null values

In [6]:
# Checking for null values
data.isnull().sum()
# Most columns share the same null count (255): these are empty rows produced
# during scraping, not billionaires with missing data, so they will be dropped.

Unnamed: 0         0
Image            255
Rank             255
Name             255
Net_Worth        255
Age              332
Source           255
Country          255
internal_link    255
Industry           0
dtype: int64

In [7]:
female_data.isnull().sum()
# The 33 null names are likewise empty rows produced during scraping, not
# missing billionaires, so they will be dropped as well.

Unnamed: 0      0
female_name    33
dtype: int64

In [8]:
data[data['Image'].isnull()]

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
10,10,,,,,,,,,Automotive
21,21,,,,,,,,,Automotive
32,32,,,,,,,,,Automotive
43,43,,,,,,,,,Automotive
54,54,,,,,,,,,Automotive
...,...,...,...,...,...,...,...,...,...,...
2819,2819,,,,,,,,,Technology
2830,2830,,,,,,,,,Technology
2841,2841,,,,,,,,,Telecom
2852,2852,,,,,,,,,Telecom


In [9]:
# Keep only the real rows. .copy() makes each result an independent DataFrame,
# so the column assignments in later cells modify these frames directly rather
# than a slice of the raw data (which triggers SettingWithCopyWarning).
data = data[data['Image'].notnull()].copy()
female_data = female_data[female_data['female_name'].notnull()].copy()

In [10]:
data.isnull().sum()
# Now only 77 values are missing, all of them in the Age column.

Unnamed: 0        0
Image             0
Rank              0
Name              0
Net_Worth         0
Age              77
Source            0
Country           0
internal_link     0
Industry          0
dtype: int64

In [11]:
female_data.isnull().sum()
# No missing values

Unnamed: 0     0
female_name    0
dtype: int64

### Add "https:" to incomplete links in the Image column

In [12]:
data['Image']
# Some links in this column are complete, but others are protocol-relative
# (they start with "//"): the "https:" prefix is missing. Let's add it.

0       https://specials-images.forbesimg.com/imageser...
1       https://specials-images.forbesimg.com/imageser...
2       https://specials-images.forbesimg.com/imageser...
3       https://specials-images.forbesimg.com/imageser...
4       //specials-images.forbesimg.com/imageserve/5dc...
                              ...                        
2858    https://specials-images.forbesimg.com/imageser...
2859    //specials-images.forbesimg.com/imageserve/5ab...
2860    //specials-images.forbesimg.com/imageserve/5a8...
2861    //specials-images.forbesimg.com/imageserve/5ab...
2862    //specials-images.forbesimg.com/imageserve/5ab...
Name: Image, Length: 2609, dtype: object

In [13]:
def full_link(value):
    """Return `value` as an absolute https URL.

    - Links that already carry a scheme ('http://' or 'https://') are returned unchanged.
    - Protocol-relative links ('//host/path') get 'https:' prepended.
    - Anything else is treated as a bare 'host/path' and gets 'https://' prepended.
    - Non-string values (e.g. NaN for a missing link) are returned unchanged
      instead of raising AttributeError.
    """
    if not isinstance(value, str):
        return value
    if value.startswith(('http://', 'https://')):
        return value
    if value.startswith('//'):
        return 'https:' + value
    return 'https://' + value

In [14]:
# Normalise every image link element by element.
data['Image'] = data['Image'].map(full_link)

In [15]:
data['Image']

0       https://specials-images.forbesimg.com/imageser...
1       https://specials-images.forbesimg.com/imageser...
2       https://specials-images.forbesimg.com/imageser...
3       https://specials-images.forbesimg.com/imageser...
4       https://specials-images.forbesimg.com/imageser...
                              ...                        
2858    https://specials-images.forbesimg.com/imageser...
2859    https://specials-images.forbesimg.com/imageser...
2860    https://specials-images.forbesimg.com/imageser...
2861    https://specials-images.forbesimg.com/imageser...
2862    https://specials-images.forbesimg.com/imageser...
Name: Image, Length: 2609, dtype: object

### Sort the Rank column in ascending order and correct its data type

In [16]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
0,0,https://specials-images.forbesimg.com/imageser...,1.0,Elon Musk,$242.7 B,52.0,"Tesla, SpaceX",United States,https://www.forbes.com/profile/elon-musk/?list...,Automotive
1,1,https://specials-images.forbesimg.com/imageser...,41.0,Robin Zeng,$32.7 B,54.0,Batteries,Hong Kong,https://www.forbes.com/profile/robin-zeng/?lis...,Automotive


In [17]:
# Order the rows by ascending rank.
data = data.sort_values(by='Rank')

In [18]:
# Renumber the rows 0..n-1 to follow the new rank order. drop=True discards the
# old index directly instead of inserting it as an 'index' column that then has
# to be removed again (and that would clash with any existing 'index' column).
data = data.reset_index(drop=True)

In [19]:
# Convert float64 to pandas' nullable integer dtype 'Int16' (capital "I").
data['Rank'] = data['Rank'].astype('Int16')

### Remove '**& family**' from name column.

In [20]:
data[data['Name'].str.contains('& family')].head()

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
1,439,https://specials-images.forbesimg.com/imageser...,2,Bernard Arnault & family,$224.1 B,74.0,LVMH,France,https://www.forbes.com/profile/bernard-arnault...,Fashion & Retail
8,2831,https://specials-images.forbesimg.com/imageser...,9,Carlos Slim Helu & family,$100.6 B,83.0,Telecom,Mexico,https://www.forbes.com/profile/carlos-slim-hel...,Telecom
14,441,https://specials-images.forbesimg.com/imageser...,15,Francoise Bettencourt Meyers & family,$89.0 B,70.0,L'Oréal,France,https://www.forbes.com/profile/francoise-bette...,Fashion & Retail
21,132,https://specials-images.forbesimg.com/imageser...,21,Julia Koch & family,$57.9 B,61.0,Koch Industries,United States,https://www.forbes.com/profile/julia-koch/?lis...,Diversified
22,1994,https://specials-images.forbesimg.com/imageser...,23,David Thomson & family,$56.6 B,66.0,Media,Canada,https://www.forbes.com/profile/david-thomson/?...,Media & Entertainment


In [21]:
# Remove the literal "& family" suffix, then trim the whitespace left behind.
name_without_family = data['Name'].str.replace('& family', '', regex=False)
data['Name'] = name_without_family.str.strip()

### Remove dollar sign and B from net_worth column and correct data type

In [22]:
data['Net_Worth']

0       $242.7 B
1       $224.1 B
2       $149.8 B
3       $144.5 B
4       $118.0 B
          ...   
2604      $1.0 B
2605      $1.0 B
2606      $0.4 B
2607      $0.0 B
2608      $0.0 B
Name: Net_Worth, Length: 2609, dtype: object

In [23]:
# Strip the dollar sign and the "B" suffix, then convert to float.
# regex=False is explicit because '$' is a regex metacharacter (end-of-string
# anchor): interpreted as a pattern it would leave the '$' in place, and the
# default value of `regex` differs between pandas versions.
data['Net_Worth'] = (
    data['Net_Worth']
    .str.replace('$', '', regex=False)
    .str.replace('B', '', regex=False)
    .str.strip()
    .astype(float)
)

### Fill age value as much as possible in Age column and correct data type

In [24]:
# These are the names, whose age is not available. I will try to fill it.
data[data['Age'].isnull()]['Name']

102      Karl Albrecht Jr.
103          Beate Heister
254     Francine von Finck
423          Douglas Leone
448            Zhou Qunfei
               ...        
2487      Francesco Saputo
2489             Wang Fuji
2564           Cai Hongbin
2578        Michael McCain
2584           Cho Jyh-jer
Name: Name, Length: 77, dtype: object

In [25]:
# Fill missing ages by scraping the Google result page for "<name> age".
# The raw text of the info box is stored as-is; the following cells extract the
# number from it. For every row that cannot be filled, nan is printed and the
# age stays missing.
# Age is cast to object first so that strings can be stored next to the numbers.
data['Age'] = data['Age'].astype(object)
for index, search_name in data.loc[data['Age'].isna(), 'Name'].items():
    try:
        # params= URL-encodes the query (names contain spaces and non-ASCII
        # characters); timeout keeps one stalled request from hanging the loop.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"{search_name} age"},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        # Network/HTTP failure for this name: leave the age missing and move on.
        print(np.nan)
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): 'AVsepf' looks like a generated CSS class name and may
    # change without notice - confirm it still matches before re-running.
    info_box = soup.find('div', {'class': 'AVsepf'})
    if info_box is None:
        # No info box on the result page: leave the age missing.
        print(np.nan)
        continue
    data.loc[index, 'Age'] = info_box.text
    print(info_box.text)

Born: 1948 (age 75 years), Essen, Germany
Born: 5 October 1951 (age 71 years), Essen, Germany
Born: 2 May 1968 (age 55 years), Munich, Germany
Born: 4 July 1957 (age 66 years), Genoa, Italy
Born: 1970 (age 53 years), Xiangxiang, Xiangtan, China
Born: 28 January 1955 (age 68 years), New Delhi
Born: 12 September 1954 (age 68 years), Cardiff, United Kingdom
Born: 11 October 1967 (age 55 years), Frankfurt, Germany
Born: 31 October 1971 (age 51 years), Buenos Aires, Argentina
nan
Born: 1930 (age 93 years)
nan
nan
Born: 9 June 1975 (age 48 years), Hamburg, Germany
nan
nan
nan
Born: 4 November 1969 (age 53 years), Bielefeld, Germany
nan
nan
nan
Born: 1968 (age 55 years)
Born: 11 August 1946 (age 76 years)
nan
nan
nan
nan
Parent organization: Lisa Dräxlmaier GmbH
Born: 3 May 1952 (age 71 years), Suresnes, France
nan
nan
nan
nan
nan
nan
Born: 15 October 1975 (age 47 years)
nan
nan
nan
nan
nan
nan
nan
nan
Born: 24 January 1979 (age 44 years), Shanghai, China
nan
nan
nan
nan
nan
nan
nan
nan
nan
n

In [26]:
data['Age'].unique()

array([52.0, 74.0, 59.0, 78.0, 67.0, 92.0, 39.0, 83.0, 50.0, 66.0, 81.0,
       49.0, 87.0, 70.0, 75.0, 68.0, 73.0, 58.0, 61.0, 85.0, 31.0, 60.0,
       95.0, 86.0, 77.0, 51.0, 53.0, 54.0, 72.0, 76.0, 69.0, 88.0, 64.0,
       82.0, 57.0, 65.0, 43.0, 80.0, 62.0, 84.0, 36.0, 90.0, 91.0,
       'Born: 1948 (age 75\xa0years), Essen, Germany',
       'Born: 5 October 1951 (age 71\xa0years), Essen, Germany', 56.0,
       41.0, 55.0, 94.0, 47.0, 93.0, 96.0, 71.0, 79.0, 89.0, 38.0, 99.0,
       63.0, 45.0, 46.0, 48.0, 44.0, 40.0,
       'Born: 2 May 1968 (age 55\xa0years), Munich, Germany', 42.0,
       'Born: 4 July 1957 (age 66\xa0years), Genoa, Italy',
       'Born: 1970 (age 53\xa0years), Xiangxiang, Xiangtan, China',
       'Born: 28 January 1955 (age 68\xa0years), New Delhi', 32.0, 34.0,
       'Born: 12 September 1954 (age 68\xa0years), Cardiff, United Kingdom',
       'Born: 11 October 1967 (age 55\xa0years), Frankfurt, Germany',
       'Born: 31 October 1971 (age 51\xa0years), Buenos 

In [27]:
# Convert to str so that string operations can be used to split out the age.
# Note: missing values become the literal string 'nan' here.
data['Age'] = data['Age'].astype(str)

In [28]:
# Pull the number out of scraped text such as
# "Born: 1948 (age 75 years), Essen, Germany".
# Anchoring on "(age <digits>" instead of splitting on the substring "age"
# keeps a place name that happens to contain "age" from being mistaken for the
# age marker. Values without that marker (plain numbers such as '52.0', 'nan',
# or text with no age in it) are kept unchanged, only stripped of whitespace.
scraped_age = data['Age'].str.extract(r'\(age\s*(\d+)', expand=False)
data['Age'] = scraped_age.fillna(data['Age'].str.strip())

In [29]:
data['Age'].unique()

array(['52.0', '74.0', '59.0', '78.0', '67.0', '92.0', '39.0', '83.0',
       '50.0', '66.0', '81.0', '49.0', '87.0', '70.0', '75.0', '68.0',
       '73.0', '58.0', '61.0', '85.0', '31.0', '60.0', '95.0', '86.0',
       '77.0', '51.0', '53.0', '54.0', '72.0', '76.0', '69.0', '88.0',
       '64.0', '82.0', '57.0', '65.0', '43.0', '80.0', '62.0', '84.0',
       '36.0', '90.0', '91.0', '75', '71', '56.0', '41.0', '55.0', '94.0',
       '47.0', '93.0', '96.0', '71.0', '79.0', '89.0', '38.0', '99.0',
       '63.0', '45.0', '46.0', '48.0', '44.0', '40.0', '55', '42.0', '66',
       '53', '68', '32.0', '34.0', '51', 'nan', '21.0', '19.0', '28.0',
       '37.0', '35.0', '93', '33.0', '48', '30.0', '20.0', '98.0', '76',
       '100.0', 'Parent organization: Lisa Dräxlmaier GmbH', '97.0', '47',
       '26.0', '44', '101.0',
       'Born: 16 March 1727, Dudley, United Kingdom', '27.0', '73',
       '29.0', '64'], dtype=object)

In [30]:
# After splitting and filling the ages, two rows still hold incorrect values. Let's correct them.
# 1.
data[data['Age'] == 'Parent organization: Lisa Dräxlmaier GmbH']

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
1471,52,https://specials-images.forbesimg.com/imageser...,1472,Fritz Draexlmaier,2.1,Parent organization: Lisa Dräxlmaier GmbH,Auto parts,Germany,https://www.forbes.com/profile/fritz-draexlmai...,Automotive


By manually searching for the name **"Fritz Draexlmaier"** on Google, I found that he is 65 years old. In the future, this value will have to be filled in manually again if his age changes.

In [31]:
# Select the row by name rather than by a hard-coded index label, which would
# point at a different billionaire if the row order ever changes.
data.loc[data['Name'] == 'Fritz Draexlmaier', 'Age'] = '65'

In [32]:
# 2.
data[data['Age'] == 'Born: 16 March 1727, Dudley, United Kingdom']
# The fetched information is wrong: a 1727 birth date cannot belong to a living billionaire.

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
2308,1095,https://specials-images.forbesimg.com/imageser...,2308,Catherine Phillips,1.2,"Born: 16 March 1727, Dudley, United Kingdom",investments,Canada,https://www.forbes.com/profile/catherine-phill...,Finance & Investments


In [33]:
# The scraped value for this person is wrong and no correct age is available,
# so mark it as missing. The row is selected by name rather than by a
# hard-coded index label, which depends on the current row order.
data.loc[data['Name'] == 'Catherine Phillips', 'Age'] = np.nan

In [34]:
data['Age'].unique()

array(['52.0', '74.0', '59.0', '78.0', '67.0', '92.0', '39.0', '83.0',
       '50.0', '66.0', '81.0', '49.0', '87.0', '70.0', '75.0', '68.0',
       '73.0', '58.0', '61.0', '85.0', '31.0', '60.0', '95.0', '86.0',
       '77.0', '51.0', '53.0', '54.0', '72.0', '76.0', '69.0', '88.0',
       '64.0', '82.0', '57.0', '65.0', '43.0', '80.0', '62.0', '84.0',
       '36.0', '90.0', '91.0', '75', '71', '56.0', '41.0', '55.0', '94.0',
       '47.0', '93.0', '96.0', '71.0', '79.0', '89.0', '38.0', '99.0',
       '63.0', '45.0', '46.0', '48.0', '44.0', '40.0', '55', '42.0', '66',
       '53', '68', '32.0', '34.0', '51', 'nan', '21.0', '19.0', '28.0',
       '37.0', '35.0', '93', '33.0', '48', '30.0', '20.0', '98.0', '76',
       '100.0', '65', '97.0', '47', '26.0', '44', '101.0', nan, '27.0',
       '73', '29.0', '64'], dtype=object)

In [35]:
# Convert the column to a nullable integer dtype. The cast goes through float
# first because the column holds strings such as '52.0' and 'nan'.
data['Age'] = data['Age'].astype('float').astype('Int16')

The **Age** column still has some NaN values. I am leaving them as they are because the data is not available.

### Separate the "internal_link" column from this dataset

In [36]:
# Keep the profile links aside as their own Series before the column is dropped.
internal_link_file = data['internal_link']

In [37]:
# Save the links to their own file; the row index is written as the first CSV column.
internal_link_file.to_csv('internal_link_file.csv', encoding='utf-8')

In [38]:
# The links now live in their own file, so remove the column from the main table.
data = data.drop(columns=['internal_link'])

### 2. Table --> female_data

In [39]:
female_data

Unnamed: 0.1,Unnamed: 0,female_name
0,0,Francoise Bettencourt Meyers & family
1,1,Alice Walton
2,2,Julia Koch & family
3,3,Jacqueline Mars
4,4,Miriam Adelson & family
...,...,...
366,366,Hedda im Brahm-Droege
367,367,Pollyanna Chu
368,368,Ana Maria Brescia Cafferata
369,369,Vera Rechulski Santo Domingo


### Remove & family from name column.

In [40]:
female_data[female_data['female_name'].str.contains('& family')].head()

Unnamed: 0.1,Unnamed: 0,female_name
0,0,Francoise Bettencourt Meyers & family
2,2,Julia Koch & family
4,4,Miriam Adelson & family
9,9,Iris Fontbona & family
12,12,Savitri Jindal & family


In [41]:
# Remove the literal "& family" suffix, then trim the whitespace left behind.
female_name_cleaned = female_data['female_name'].str.replace('& family', '', regex=False)
female_data['female_name'] = female_name_cleaned.str.strip()

###  Create Sex column with value Female

In [42]:
# Every name in this table is a woman, so tag all rows with the same value.
female_data = female_data.assign(Sex='Female')

In [43]:
female_data

Unnamed: 0.1,Unnamed: 0,female_name,Sex
0,0,Francoise Bettencourt Meyers,Female
1,1,Alice Walton,Female
2,2,Julia Koch,Female
3,3,Jacqueline Mars,Female
4,4,Miriam Adelson,Female
...,...,...,...
366,366,Hedda im Brahm-Droege,Female
367,367,Pollyanna Chu,Female
368,368,Ana Maria Brescia Cafferata,Female
369,369,Vera Rechulski Santo Domingo,Female


### This dataset should not stand alone; merge it with data.

In [44]:
# Left-merge on the billionaire's name ('Name' in data, 'female_name' in female_data):
# every row of data is kept, and rows without a match in female_data get NaN
# in the columns that come from female_data.
billionaires_data_cleaned = data.merge(female_data, how='left', left_on='Name', right_on='female_name')

In [45]:
billionaires_data_cleaned

Unnamed: 0,Unnamed: 0_x,Image,Rank,Name,Net_Worth,Age,Source,Country,Industry,Unnamed: 0_y,female_name,Sex
0,0,https://specials-images.forbesimg.com/imageser...,1,Elon Musk,242.7,52,"Tesla, SpaceX",United States,Automotive,,,
1,439,https://specials-images.forbesimg.com/imageser...,2,Bernard Arnault,224.1,74,LVMH,France,Fashion & Retail,,,
2,2479,https://specials-images.forbesimg.com/imageser...,3,Jeff Bezos,149.8,59,Amazon,United States,Technology,,,
3,2480,https://specials-images.forbesimg.com/imageser...,4,Larry Ellison,144.5,78,Oracle,United States,Technology,,,
4,2481,https://specials-images.forbesimg.com/imageser...,5,Bill Gates,118.0,67,Microsoft,United States,Technology,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2604,2829,https://specials-images.forbesimg.com/imageser...,2606,Chang Jing,1.0,40,Technology,China,Technology,,,
2605,2437,https://specials-images.forbesimg.com/imageser...,2607,Vijay Shekhar Sharma,1.0,45,financial technology,India,Service,,,
2606,726,https://specials-images.forbesimg.com/imageser...,2608,Kanye West,0.4,46,"music, sneakers",United States,Fashion & Retail,,,
2607,1143,https://specials-images.forbesimg.com/imageser...,2610,Sam Bankman-Fried,0.0,31,cryptocurrency exchange,United States,Finance & Investments,,,


## After merging both datasets, new issues occurred.


### Issues with this Dataset

#### 1. table -> billionaires_data_cleaned

The `Unnamed: 0_x`, `Unnamed: 0_y`, and `female_name` columns should not be there; drop them.

The `Sex` column has some null values. Replace them with 'Male'.

Some billionaires' net worth is less than 1 billion, so they no longer count as billionaires. Drop them.


### Dropping Unnamed: 0_x, Unnamed: 0_y, female_name columns

In [46]:
# Discard the two leftover index columns and the duplicate name column from the merge.
billionaires_data_cleaned = billionaires_data_cleaned.drop(
    columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'female_name']
)

In [47]:
def nan_to_male(value):
    """Return 'Female' unchanged; map every other value (including NaN) to 'Male'."""
    return value if value == 'Female' else "Male"

In [48]:
# Rows that did not match a female name have no Sex value; label them 'Male'.
billionaires_data_cleaned['Sex'] = billionaires_data_cleaned['Sex'].map(nan_to_male)

In [49]:
billionaires_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2609 entries, 0 to 2608
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Image      2609 non-null   object 
 1   Rank       2609 non-null   Int16  
 2   Name       2609 non-null   object 
 3   Net_Worth  2609 non-null   float64
 4   Age        2553 non-null   Int16  
 5   Source     2609 non-null   object 
 6   Country    2609 non-null   object 
 7   Industry   2609 non-null   object 
 8   Sex        2609 non-null   object 
dtypes: Int16(2), float64(1), object(6)
memory usage: 178.3+ KB


In [53]:
# Show the people whose Net_Worth is below 1 billion (they are removed in the next cell).
billionaires_data_cleaned[billionaires_data_cleaned['Net_Worth'] < 1]

Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,Industry,Sex
2606,https://specials-images.forbesimg.com/imageser...,2608,Kanye West,0.4,46,"music, sneakers",United States,Fashion & Retail,Male
2607,https://specials-images.forbesimg.com/imageser...,2610,Sam Bankman-Fried,0.0,31,cryptocurrency exchange,United States,Finance & Investments,Male
2608,https://specials-images.forbesimg.com/imageser...,2611,Elizabeth Holmes,0.0,39,blood testing,United States,Healthcare,Female


In [56]:
# Keep only the rows worth at least 1 billion.
is_billionaire = billionaires_data_cleaned['Net_Worth'] >= 1
billionaires_data_cleaned = billionaires_data_cleaned[is_billionaire]

#### Now our data is fully cleaned and ready for analysis.

In [58]:
# index=False: the row index carries no information here, and writing it is
# what produces a stray 'Unnamed: 0' column when the CSV is read back with
# pd.read_csv (the same leftover column this notebook had to clean up).
billionaires_data_cleaned.to_csv('billionaires_cleaned.csv', encoding='utf-8', index=False)