In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests
import re
#pd.set_option('display.max_rows', None)

In [2]:
#loading data
data = pd.read_csv("billionare_file.csv")
female_data = pd.read_csv("female_billionares.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
0,0,https://specials-images.forbesimg.com/imageser...,1.0,Elon Musk,$220.7 B,52.0,"Tesla, SpaceX",United States,https://www.forbes.com/profile/elon-musk/?list...,Automotive
1,1,https://specials-images.forbesimg.com/imageser...,42.0,Zeng Robin,$32.9 B,54.0,Batteries,Hong Kong,https://www.forbes.com/profile/zeng-robin/?lis...,Automotive
2,2,https://specials-images.forbesimg.com/imageser...,60.0,Stefan Quandt,$25.1 B,57.0,BMW,Germany,https://www.forbes.com/profile/stefan-quandt/?...,Automotive
3,3,https://specials-images.forbesimg.com/imageser...,66.0,Susanne Klatten,$24.1 B,61.0,"BMW, pharmaceuticals",Germany,https://www.forbes.com/profile/susanne-klatten...,Automotive
4,4,//specials-images.forbesimg.com/imageserve/5dc...,93.0,Wang Chuanfu,$18.0 B,57.0,"Batteries, automobiles",China,https://www.forbes.com/profile/wang-chuanfu/?l...,Automotive


In [4]:
female_data.head()

Unnamed: 0.1,Unnamed: 0,female_name
0,0,Francoise Bettencourt Meyers & family
1,1,Alice Walton
2,2,Julia Koch & family
3,3,Jacqueline Mars
4,4,MacKenzie Scott


# Summary of the Data

This dataset covers more than 2,600 billionaires listed on <b>"Forbes Real Time Billionaire"</b> in 2023 and provides some basic information about each of them. The data was scraped from "https://www.forbes.com/real-time-billionaires/" on 15 August.
 
## Columns Description

#### **Table** -> `data`:

- `Image` - Image of billionaires in small size as link.

- `Rank` - The billionaire's rank, based on their net worth as calculated by Forbes.

- `Name` - Name of the billionaire.

- `Net_Worth` - The billionaire's real-time net worth, in billions of dollars.

- `Age` - Age of the billionaire.

- `Source` - The company or business through which they earn their money.

- `Country` - The country that the billionaire (or their company) belongs to.

- `internal_link` - This column has links that holds some extra information about the billionaires. Further this column will be used to scrape the extra information about billionaires.

- `Industry` - The industry their company belongs to.

#### **Table** -> `female_data`:

**female_data** contains only the names of the billionaires who are female, so that I can distinguish between male and female billionaires. To do this, I will create a new column in **female_data** called **Sex** with the value **Female**.

### Issues with Dataset

#### 1. table -> data
   - `Image Column` - Some of the links are incomplete: the 'https:' prefix is missing.
   
   - `Name column` - Some rows have **& family** appended to the name. - `Dirty Data`
   
   - `Rank Column` - This column is not sorted in ascending order; the rows are in random order.
   
   - `Net_Worth` - The dollar sign and the letter B are included in the net worth value. - `Dirty Data`
   
   - `Age` - This column has some missing values.
   
   - Some rows are entirely null. They were introduced during scraping and do not correspond to any billionaire, so I will drop them later. - `Dirty Data`
   
#### 2. table -> female_data
   - `Name Column` - Some rows have **& family** appended to the name. - `Dirty Data`
   - The **Sex** column does not exist yet; I will create it.

## Data Cleaning

#### Table = data

- Remove null values.
- Add the missing "https:" prefix to the incomplete links in the Image column.
- Sort the rows by the Rank column in ascending order.
- Remove **& family** from the Name column.
- Remove the dollar sign and B from the Net_Worth column.
- Fill in as many missing values as possible in the Age column.
- Separate the "internal_link" column from this dataset.
- The Rank, Net_Worth, and Age columns have incorrect data types; convert them.


#### Table = female_data

- Remove **& family** from name column.
- Create **Sex** column with value **Female**
- This dataset should not be independent, merge it with data.

### 1. Table --> Data

### Remove null values

In [6]:
# Count the missing values in each column.
data.isnull().sum()
# Most columns share the same null count (252 in the output below). These nulls come from
# empty rows produced during scraping, not from billionaires whose details are missing,
# so those rows are removed in a later cell.

Unnamed: 0         0
Image            252
Rank             252
Name             252
Net_Worth        252
Age              326
Source           252
Country          252
internal_link    252
Industry           0
dtype: int64

In [7]:
female_data.isnull().sum()
# The same happens here: the 33 null names shown below came from scraping,
# so those rows are removed in a later cell.

Unnamed: 0      0
female_name    33
dtype: int64

In [8]:
data[data['Image'].isnull()]

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
10,10,,,,,,,,,Automotive
21,21,,,,,,,,,Automotive
32,32,,,,,,,,,Automotive
43,43,,,,,,,,,Automotive
54,54,,,,,,,,,Automotive
...,...,...,...,...,...,...,...,...,...,...
2798,2798,,,,,,,,,Technology
2809,2809,,,,,,,,,Technology
2820,2820,,,,,,,,,Technology
2840,2840,,,,,,,,,Telecom


In [9]:
# Drop the empty rows left behind by the scraper.
# `.copy()` makes each result an independent frame, so the column assignments in
# later cells modify it directly instead of writing to a slice of the original
# (which is what raises pandas' SettingWithCopyWarning).
data = data[data['Image'].notnull()].copy()
female_data = female_data[female_data['female_name'].notnull()].copy()

In [10]:
data.isnull().sum()
# Now only the Age column has missing values (74 in the output below).

Unnamed: 0        0
Image             0
Rank              0
Name              0
Net_Worth         0
Age              74
Source            0
Country           0
internal_link     0
Industry          0
dtype: int64

In [11]:
female_data.isnull().sum()
# No missing values

Unnamed: 0     0
female_name    0
dtype: int64

In [12]:
print(data.shape)
print(female_data.shape)

(2609, 10)
(334, 2)


### Add "https:" to the incomplete links in the Image column

In [13]:
data['Image']
# Some links in this column are complete, but others are missing the leading "https:"
# and start directly with "//". Let's add the missing prefix.

0       https://specials-images.forbesimg.com/imageser...
1       https://specials-images.forbesimg.com/imageser...
2       https://specials-images.forbesimg.com/imageser...
3       https://specials-images.forbesimg.com/imageser...
4       //specials-images.forbesimg.com/imageserve/5dc...
                              ...                        
2856    //specials-images.forbesimg.com/imageserve/5e7...
2857    //specials-images.forbesimg.com/imageserve/5a8...
2858    https://specials-images.forbesimg.com/imageser...
2859    //specials-images.forbesimg.com/imageserve/5ab...
2860    //specials-images.forbesimg.com/imageserve/5ab...
Name: Image, Length: 2609, dtype: object

In [14]:
def full_link(value):
    """Return `value` as an absolute URL, adding the https scheme when it is missing.

    Parameters
    ----------
    value : str
        An image link. It may already be absolute ('https://host/path'),
        protocol-relative ('//host/path'), or a bare 'host/path'.

    Returns
    -------
    The link with an 'https:' scheme. Links that already start with 'https:' or
    'http:' are returned unchanged. Non-string values (e.g. NaN or None) and empty
    strings are also returned unchanged instead of raising or producing a bare scheme.
    """
    # Missing values cannot be turned into a link; pass them through untouched.
    if not isinstance(value, str) or not value:
        return value
    # Already absolute: prepending another scheme would corrupt the link.
    if value.startswith(('https:', 'http:')):
        return value
    # Protocol-relative link: only the scheme itself is missing.
    if value.startswith('//'):
        return 'https:' + value
    # Bare host/path: both the scheme and the '//' separator are missing.
    return 'https://' + value

In [15]:
# Rewrite every image link through full_link so that all of them carry the scheme.
image_links = data['Image']
data['Image'] = image_links.apply(full_link)

In [16]:
data['Image']

0       https://specials-images.forbesimg.com/imageser...
1       https://specials-images.forbesimg.com/imageser...
2       https://specials-images.forbesimg.com/imageser...
3       https://specials-images.forbesimg.com/imageser...
4       https://specials-images.forbesimg.com/imageser...
                              ...                        
2856    https://specials-images.forbesimg.com/imageser...
2857    https://specials-images.forbesimg.com/imageser...
2858    https://specials-images.forbesimg.com/imageser...
2859    https://specials-images.forbesimg.com/imageser...
2860    https://specials-images.forbesimg.com/imageser...
Name: Image, Length: 2609, dtype: object

### Sort by the Rank column in ascending order and correct its data type

In [17]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
0,0,https://specials-images.forbesimg.com/imageser...,1.0,Elon Musk,$220.7 B,52.0,"Tesla, SpaceX",United States,https://www.forbes.com/profile/elon-musk/?list...,Automotive
1,1,https://specials-images.forbesimg.com/imageser...,42.0,Zeng Robin,$32.9 B,54.0,Batteries,Hong Kong,https://www.forbes.com/profile/zeng-robin/?lis...,Automotive


In [18]:
# Order the rows from the best rank (1) downwards.
data.sort_values(by='Rank', ascending=True, inplace=True)

In [19]:
# Renumber the rows 0..n-1 after sorting; the old index is discarded rather than kept as a column.
data.reset_index(drop=True, inplace=True)

In [20]:
# Convert Rank from float64 (values such as 1.0, 42.0) to pandas' nullable integer dtype Int16.
data['Rank'] = data['Rank'].astype('Int16')

### Remove '**& family**' from name column.

In [22]:
data[data['Name'].str.contains('& family')].head()

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
1,437,https://specials-images.forbesimg.com/imageser...,2,Bernard Arnault & family,$216.3 B,74.0,LVMH,France,https://www.forbes.com/profile/bernard-arnault...,Fashion & Retail
11,2830,https://specials-images.forbesimg.com/imageser...,12,Carlos Slim Helu & family,$95.0 B,83.0,Telecom,Mexico,https://www.forbes.com/profile/carlos-slim-hel...,Telecom
13,438,https://specials-images.forbesimg.com/imageser...,14,Francoise Bettencourt Meyers & family,$89.1 B,70.0,L'Oréal,France,https://www.forbes.com/profile/francoise-bette...,Fashion & Retail
20,130,https://specials-images.forbesimg.com/imageser...,21,Julia Koch & family,$60.2 B,61.0,Koch Industries,United States,https://www.forbes.com/profile/julia-koch/?lis...,Diversified
22,1987,https://specials-images.forbesimg.com/imageser...,23,David Thomson & family,$56.8 B,66.0,Media,Canada,https://www.forbes.com/profile/david-thomson/?...,Media & Entertainment


In [23]:
# Remove the '& family' suffix, then trim the whitespace it leaves behind.
name_without_suffix = data['Name'].str.replace('& family','')
data['Name'] = name_without_suffix.str.strip()

### Remove dollar sign and B from net_worth column and correct data type

In [25]:
data['Net_Worth'].sample(5)

2160     $1.3 B
54      $27.9 B
1575     $1.9 B
2466     $1.1 B
539      $5.1 B
Name: Net_Worth, dtype: object

In [26]:
# Strip the dollar sign and the "B" suffix from values such as '$27.9 B', then parse as float.
# regex=False forces literal matching: read as a regular expression, '$' is the
# end-of-string anchor, and the default value of `regex` differs between pandas versions.
data['Net_Worth'] = (
    data['Net_Worth']
    .str.replace('$', '', regex=False)
    .str.replace('B', '', regex=False)
    .str.strip()
    .astype(float)
)

### Fill age value as much as possible in Age column and correct data type

In [28]:
# Names of the billionaires whose age is not available. I will try to fill these in.
age_is_missing = data['Age'].isnull()
data.loc[age_is_missing, 'Name']

100          Beate Heister
101      Karl Albrecht Jr.
249     Francine von Finck
470            Zhou Qunfei
506        Marcos Galperin
               ...        
2499      Francesco Saputo
2517           Zong Yanmin
2545        Shao Jianxiong
2549           Cai Hongbin
2558            Zhang Hong
Name: Name, Length: 74, dtype: object

In [29]:
# Fill in the missing ages where possible.
# Age is an important column, so for every row without one we run a Google search for
# "<name> age" and keep the text of the result snippet (the div with class 'AVsepf'),
# e.g. "Born: 1948 (age 75 years), Essen, Germany". The raw snippet text is stored as-is;
# the numeric age is extracted from it in the following cells.
# NOTE(review): 'AVsepf' looks like a generated class name and may stop matching — confirm
# before re-running.
missing_age_names = data.loc[data['Age'].isna(), 'Name']
for index, search_name in missing_age_names.items():
    search_query = f"{search_name} age"
    try:
        # `params` URL-encodes the query (names may contain spaces or symbols);
        # `timeout` keeps a single stalled request from hanging the whole loop.
        response = requests.get(
            "https://www.google.com/search",
            params={'q': search_query},
            timeout=10,
        )
        snippet = BeautifulSoup(response.text, 'html.parser').find('div', {'class': 'AVsepf'})
    except requests.RequestException:
        # Network failure for this name: treat it the same as "no snippet found".
        snippet = None

    if snippet is None:
        # Nothing usable was found; the age stays missing.
        data.loc[index, 'Age'] = np.nan
        print(np.nan)
    else:
        snippet_text = snippet.text
        data.loc[index, 'Age'] = snippet_text
        print(snippet_text)

Born: 5 October 1951 (age 71 years), Essen, Germany
Born: 1948 (age 75 years), Essen, Germany
Born: 2 May 1968 (age 55 years), Munich, Germany
Born: 1970 (age 53 years), Xiangxiang, Xiangtan, China
Born: 31 October 1971 (age 51 years), Buenos Aires, Argentina
nan
nan
nan
Born: 1930 (age 93 years)
nan
Born: 9 June 1975 (age 48 years), Hamburg, Germany
nan
nan
nan
Born: 4 November 1969 (age 53 years), Bielefeld, Germany
nan
nan
nan
Born: 1968 (age 55 years)
Born: 11 August 1946 (age 77 years)
nan
nan
nan
nan
Born: 3 May 1952 (age 71 years), Suresnes, France
nan
nan
Revenue: 510 crores EUR (2022)
nan
nan
nan
nan
nan
Born: 15 October 1975 (age 47 years)
nan
Born: 23 November 1963 (age 59 years), Lima, Peru
nan
nan
nan
Born: 23 August 1982 (age 40 years), Wuhan, China
Born: 24 January 1979 (age 44 years), Shanghai, China
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Born: 27 March 1909
nan
nan
nan
nan
nan
nan
Born: 13 November 1958 (age 64 years), Florenceville-Bristol, Canada
nan
nan
nan

In [30]:
data['Age'].unique()

array([52.0, 74.0, 59.0, 78.0, 92.0, 67.0, 50.0, 39.0, 49.0, 81.0, 83.0,
       66.0, 70.0, 87.0, 75.0, 58.0, 73.0, 68.0, 61.0, 85.0, 31.0, 60.0,
       86.0, 95.0, 53.0, 51.0, 77.0, 54.0, 72.0, 82.0, 76.0, 88.0, 65.0,
       69.0, 43.0, 57.0, 62.0, 36.0, 80.0, 71.0, 91.0,
       'Born: 5 October 1951 (age 71\xa0years), Essen, Germany',
       'Born: 1948 (age 75\xa0years), Essen, Germany', 41.0, 56.0, 55.0,
       90.0, 94.0, 47.0, 84.0, 64.0, 93.0, 96.0, 79.0, 89.0, 38.0, 63.0,
       99.0, 46.0, 44.0, 45.0, 48.0, 40.0,
       'Born: 2 May 1968 (age 55\xa0years), Munich, Germany', 42.0, 97.0,
       'Born: 1970 (age 53\xa0years), Xiangxiang, Xiangtan, China', 33.0,
       34.0,
       'Born: 31 October 1971 (age 51\xa0years), Buenos Aires, Argentina',
       19.0, 28.0, 21.0, nan, 37.0, 'Born: 1930 (age 93\xa0years)', 35.0,
       30.0, 'Born: 9 June 1975 (age 48\xa0years), Hamburg, Germany',
       'Born: 4 November 1969 (age 53\xa0years), Bielefeld, Germany',
       20.0, 98.0, 'Bo

In [31]:
# Age now mixes floats, NaN and scraped text. Convert everything to str so that
# string operations can be used to split the text and extract the age.
data['Age'] = data['Age'].astype(str)

In [32]:
# Scraped snippets look like "Born: ... (age 71 years), ...": keep the text after the
# last "age" and before the first "years". Values without those markers are left unchanged.
after_age_marker = data['Age'].str.split('age').str.get(-1)
before_years_marker = after_age_marker.str.split('years').str.get(0)
data['Age'] = before_years_marker.str.strip()

In [33]:
data['Age'].unique()

array(['52.0', '74.0', '59.0', '78.0', '92.0', '67.0', '50.0', '39.0',
       '49.0', '81.0', '83.0', '66.0', '70.0', '87.0', '75.0', '58.0',
       '73.0', '68.0', '61.0', '85.0', '31.0', '60.0', '86.0', '95.0',
       '53.0', '51.0', '77.0', '54.0', '72.0', '82.0', '76.0', '88.0',
       '65.0', '69.0', '43.0', '57.0', '62.0', '36.0', '80.0', '71.0',
       '91.0', '71', '75', '41.0', '56.0', '55.0', '90.0', '94.0', '47.0',
       '84.0', '64.0', '93.0', '96.0', '79.0', '89.0', '38.0', '63.0',
       '99.0', '46.0', '44.0', '45.0', '48.0', '40.0', '55', '42.0',
       '97.0', '53', '33.0', '34.0', '51', '19.0', '28.0', '21.0', 'nan',
       '37.0', '93', '35.0', '30.0', '48', '20.0', '98.0', '77', '100.0',
       'Revenue: 510 crores EUR (2022)', '47', '59', '27.0', '40', '44',
       '101.0', 'Born: 27 March 1909', '64', '74', '29.0', '35'],
      dtype=object)

In [35]:
# After splitting, two rows still hold text that is not an age. Let's correct them one at a time.
# 1. The snippet found for this person is a revenue figure, not an age.
data[data['Age'] == 'Revenue: 510 crores EUR (2022)']

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
1492,51,https://specials-images.forbesimg.com/imageser...,1493,Fritz Draexlmaier,2.0,Revenue: 510 crores EUR (2022),Auto parts,Germany,https://www.forbes.com/profile/fritz-draexlmai...,Automotive


By searching for **"Fritz Draexlmaier"** manually on Google, I found that he is 65 years old. The scraped value is cleared (set to NaN) in the next cell, so this age has to be filled in manually, and updated again in the future as it changes.

In [39]:
# The scraped value for this row is a revenue figure, not an age, so clear it.
# The row is selected by its bad value rather than by the hard-coded label 1492: that
# label only points at the right person for one particular sort order / scrape result,
# and would otherwise wipe a valid age from whichever row happens to sit there.
data.loc[data['Age'] == 'Revenue: 510 crores EUR (2022)', 'Age'] = np.nan

In [40]:
# 2. The snippet found for this person has a birth date but no "(age N years)" part.
data[data['Age'] == 'Born: 27 March 1909']
# The information fetched is wrong.

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,internal_link,Industry
2257,1929,https://specials-images.forbesimg.com/imageser...,2257,Karl Knauf,1.2,Born: 27 March 1909,Building materials,Germany,https://www.forbes.com/profile/karl-knauf/?lis...,Manufacturing


In [42]:
# The scraped value for this row is a birth date without an age, so clear it.
# The row is selected by its bad value rather than by the hard-coded label 2257: that
# label only points at the right person for one particular sort order / scrape result,
# and would otherwise wipe a valid age from whichever row happens to sit there.
data.loc[data['Age'] == 'Born: 27 March 1909', 'Age'] = np.nan

In [43]:
data['Age'].unique()

array(['52.0', '74.0', '59.0', '78.0', '92.0', '67.0', '50.0', '39.0',
       '49.0', '81.0', '83.0', '66.0', '70.0', '87.0', '75.0', '58.0',
       '73.0', '68.0', '61.0', '85.0', '31.0', '60.0', '86.0', '95.0',
       '53.0', '51.0', '77.0', '54.0', '72.0', '82.0', '76.0', '88.0',
       '65.0', '69.0', '43.0', '57.0', '62.0', '36.0', '80.0', '71.0',
       '91.0', '71', '75', '41.0', '56.0', '55.0', '90.0', '94.0', '47.0',
       '84.0', '64.0', '93.0', '96.0', '79.0', '89.0', '38.0', '63.0',
       '99.0', '46.0', '44.0', '45.0', '48.0', '40.0', '55', '42.0',
       '97.0', '53', '33.0', '34.0', '51', '19.0', '28.0', '21.0', 'nan',
       '37.0', '93', '35.0', '30.0', '48', '20.0', '98.0', '77', '100.0',
       nan, '47', '59', '27.0', '40', '44', '101.0', '64', '74', '29.0',
       '35'], dtype=object)

In [44]:
# Convert Age to integers. The values are strings such as '52.0', '71' and 'nan', so they
# are parsed as float first and then cast to pandas' nullable Int16, which keeps the
# remaining missing ages as missing values.
data['Age'] = data['Age'].astype('float').astype('Int16')

The **Age** column still has some NaN values. I am leaving them as they are because the data is not available.

### Separate the "internal_link" column from this dataset.

In [45]:
# Take the profile links out as their own Series (it keeps the row index of `data`).
internal_link_file = data['internal_link']

In [46]:
# Save the links to their own CSV file for later scraping of the extra information.
internal_link_file.to_csv('internal_link_file.csv', encoding='utf-8')

In [47]:
# The links were saved separately, so the column is no longer needed in the main table.
data = data.drop(columns=['internal_link'])

In [49]:
data.sample(10)

Unnamed: 0.1,Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,Industry
1463,2282,https://specials-images.forbesimg.com/imageser...,1464,Katsumi Tada,2.1,78,real estate,Japan,Real Estate
816,539,https://specials-images.forbesimg.com/imageser...,817,Takao Yasuda,3.6,74,retail,Japan,Fashion & Retail
459,1424,https://specials-images.forbesimg.com/imageser...,460,Patrick Soon-Shiong,5.9,71,Pharmaceuticals,United States,Healthcare
1314,964,https://specials-images.forbesimg.com/imageser...,1315,Thomas James,2.4,81,Finance,United States,Finance & Investments
1300,2268,https://specials-images.forbesimg.com/imageser...,1301,Mitchell Goldhar,2.4,62,Real estate,Canada,Real Estate
213,2522,https://specials-images.forbesimg.com/imageser...,214,Hasso Plattner,9.5,79,Software,Germany,Technology
2227,1649,https://specials-images.forbesimg.com/imageser...,2228,Martin Moller Nielsen,1.3,59,Aircraft leasing,Denmark,Logistics
1085,574,https://specials-images.forbesimg.com/imageser...,1086,Joy Alukkas,2.8,66,Jewelry,India,Fashion & Retail
339,1185,https://specials-images.forbesimg.com/imageser...,340,Pauline MacMillan Keinath,7.2,89,Cargill,United States,Food & Beverage
408,2196,https://specials-images.forbesimg.com/imageser...,409,Mangal Prabhat Lodha,6.5,67,Real estate,India,Real Estate


### 2. Table --> female_data

In [50]:
female_data

Unnamed: 0.1,Unnamed: 0,female_name
0,0,Francoise Bettencourt Meyers & family
1,1,Alice Walton
2,2,Julia Koch & family
3,3,Jacqueline Mars
4,4,MacKenzie Scott
...,...,...
361,361,Cristina Junqueira
363,363,Pollyanna Chu
364,364,Ana Maria Brescia Cafferata
365,365,Ma Xiuhui


### Remove & family from name column.

In [52]:
female_data[female_data['female_name'].str.contains('& family')].sample(5)

Unnamed: 0.1,Unnamed: 0,female_name
83,83,Maria Fernanda Amorim & family
160,160,Eleanor Butt Crook & family
104,104,Chan Laiwa & family
33,33,Antonia Ax:son Johnson & family
2,2,Julia Koch & family


In [53]:
# Remove the '& family' suffix, then trim the whitespace it leaves behind.
female_name_without_suffix = female_data['female_name'].str.replace('& family','')
female_data['female_name'] = female_name_without_suffix.str.strip()

###  Create Sex column with value Female

In [54]:
# Every name in this table is a female billionaire, so the new column is a constant.
female_data['Sex'] = 'Female'

In [55]:
female_data

Unnamed: 0.1,Unnamed: 0,female_name,Sex
0,0,Francoise Bettencourt Meyers,Female
1,1,Alice Walton,Female
2,2,Julia Koch,Female
3,3,Jacqueline Mars,Female
4,4,MacKenzie Scott,Female
...,...,...,...
361,361,Cristina Junqueira,Female
363,363,Pollyanna Chu,Female
364,364,Ana Maria Brescia Cafferata,Female
365,365,Ma Xiuhui,Female


### This dataset should not be independent, merge it with data.

In [56]:
# Left merge: keep every row of `data` and match its 'Name' against 'female_name'.
# Rows without a match get NaN in the columns coming from female_data (including 'Sex').
billionaires_data_cleaned = data.merge(female_data, how='left', left_on='Name', right_on='female_name')

In [57]:
billionaires_data_cleaned

Unnamed: 0,Unnamed: 0_x,Image,Rank,Name,Net_Worth,Age,Source,Country,Industry,Unnamed: 0_y,female_name,Sex
0,0,https://specials-images.forbesimg.com/imageser...,1,Elon Musk,220.7,52,"Tesla, SpaceX",United States,Automotive,,,
1,437,https://specials-images.forbesimg.com/imageser...,2,Bernard Arnault,216.3,74,LVMH,France,Fashion & Retail,,,
2,2480,https://specials-images.forbesimg.com/imageser...,3,Jeff Bezos,160.7,59,Amazon,United States,Technology,,,
3,2481,https://specials-images.forbesimg.com/imageser...,4,Larry Ellison,145.7,78,Oracle,United States,Technology,,,
4,729,https://specials-images.forbesimg.com/imageser...,5,Warren Buffett,119.1,92,Berkshire Hathaway,United States,Finance & Investments,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2604,2860,https://specials-images.forbesimg.com/imageser...,2606,Zhong Peifeng,1.0,60,semiconductor,United States,Telecom,,,
2605,728,https://specials-images.forbesimg.com/imageser...,2607,Kanye West,0.4,46,"music, sneakers",United States,Fashion & Retail,,,
2606,1610,https://specials-images.forbesimg.com/imageser...,2608,Chung Yong-ji,0.0,53,Biotech,South Korea,Healthcare,,,
2607,1147,https://specials-images.forbesimg.com/imageser...,2609,Sam Bankman-Fried,0.0,31,cryptocurrency exchange,United States,Finance & Investments,,,


## After merging both datasets, new issues occurred.


### Issues with this Dataset

#### 1. table -> billionaires_data_cleaned

`Unnamed: 0_x, Unnamed: 0_y, female_name` columns should not be there; drop them.

`Sex` column has some null values. Replace them with 'Male'.

Some billionaires have a net worth of less than 1 billion, so they are no longer considered billionaires. Drop them.


### Dropping Unnamed: 0_x, Unnamed: 0_y, female_name columns

In [58]:
# Remove the leftover index columns and the duplicate name column brought in by the merge.
columns_to_drop = ['Unnamed: 0_x','Unnamed: 0_y','female_name']
billionaires_data_cleaned = billionaires_data_cleaned.drop(columns=columns_to_drop)

In [59]:
def nan_to_male(value):
    """Return 'Female' when `value` is exactly 'Female'; return "Male" for anything else (NaN included)."""
    return value if value == 'Female' else "Male"

In [60]:
# Rows that did not match a female name have no Sex value; label them 'Male'.
sex_after_merge = billionaires_data_cleaned['Sex']
billionaires_data_cleaned['Sex'] = sex_after_merge.apply(nan_to_male)

In [61]:
billionaires_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2609 entries, 0 to 2608
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Image      2609 non-null   object 
 1   Rank       2609 non-null   Int16  
 2   Name       2609 non-null   object 
 3   Net_Worth  2609 non-null   float64
 4   Age        2554 non-null   Int16  
 5   Source     2609 non-null   object 
 6   Country    2609 non-null   object 
 7   Industry   2609 non-null   object 
 8   Sex        2609 non-null   object 
dtypes: Int16(2), float64(1), object(6)
memory usage: 158.1+ KB


In [62]:
# Show the people whose Net_Worth is below 1 (billion); they are removed in the next cell.
billionaires_data_cleaned[billionaires_data_cleaned['Net_Worth'] < 1]

Unnamed: 0,Image,Rank,Name,Net_Worth,Age,Source,Country,Industry,Sex
2605,https://specials-images.forbesimg.com/imageser...,2607,Kanye West,0.4,46,"music, sneakers",United States,Fashion & Retail,Male
2606,https://specials-images.forbesimg.com/imageser...,2608,Chung Yong-ji,0.0,53,Biotech,South Korea,Healthcare,Male
2607,https://specials-images.forbesimg.com/imageser...,2609,Sam Bankman-Fried,0.0,31,cryptocurrency exchange,United States,Finance & Investments,Male
2608,https://specials-images.forbesimg.com/imageser...,2610,Elizabeth Holmes,0.0,39,blood testing,United States,Healthcare,Female


In [63]:
# Keep only the rows whose net worth is at least 1 (billion).
is_billionaire = billionaires_data_cleaned['Net_Worth'] >= 1
billionaires_data_cleaned = billionaires_data_cleaned[is_billionaire]

In [65]:
billionaires_data_cleaned.shape

(2605, 9)

#### Now our data is fully cleaned and ready for analysis.

In [67]:
# Save the cleaned dataset.
# NOTE(review): no `index` argument is passed, so the DataFrame index is presumably written
# as an extra unnamed first column; this may be how the 'Unnamed: 0' columns in the input
# files arose. Consider index=False — confirm that no downstream reader relies on it first.
billionaires_data_cleaned.to_csv('billionaires_data_cleaned.csv', encoding='utf-8')

# Thank You