In [22]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix ,precision_score 



In [23]:
# URL of the Wikipedia page to scrape
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Fail fast: the timeout stops the cell from hanging on a stalled connection,
# and raise_for_status() reports HTTP errors (403/404/5xx) here instead of
# letting an error page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Find the table with class 'wikitable sortable'
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # soup.find returns None when nothing matches; stop with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError(f"No table with class 'wikitable sortable' found at {url}")

# Extract table headers: the stripped text of every <th> cell in the table
world_titles = table.find_all('th')
world_table_titles = [title.text.strip() for title in world_titles]

# Create an empty DataFrame with the extracted headers
df = pd.DataFrame(columns=world_table_titles)
df



Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [24]:
# Extract data from the table rows and build the DataFrame in one step.
# Collecting the rows in a list first avoids growing the frame one row at a
# time, and because `df` is rebuilt from scratch the cell can be re-run
# without appending duplicate rows.
column_data = table.find_all('tr')

scraped_rows = []
for row in column_data[1:]:  # the first <tr> is skipped (header row)
    individual_row_data = [data.text.strip() for data in row.find_all('td')]
    if len(individual_row_data) != len(world_table_titles):
        # Keep the strictness of a row-by-row insert: a row whose cell count
        # does not match the headers is an error, not something to pad.
        raise ValueError(
            f"Row {len(scraped_rows)} has {len(individual_row_data)} cells, "
            f"expected {len(world_table_titles)}"
        )
    scraped_rows.append(individual_row_data)

df = pd.DataFrame(scraped_rows, columns=world_table_titles)

df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"


In [25]:
# Save the initial DataFrame to CSV for verification.
# The same frame is written twice, under two different file names.
for csv_path in (r'C:\hehe\company_modify.csv', r'C:\hehe\company_.csv'):
    df.to_csv(csv_path, index=False)


In [26]:
# Clean and preprocess data.
# Casting to str before using the .str accessor keeps this cell re-runnable:
# after the first run both columns are numeric, and .str would raise on them.
df['Revenue (USD millions)'] = (
    df['Revenue (USD millions)'].astype(str).str.replace(',', '').astype(float)
)

# Employees: remove bracketed markers such as "[1]" and thousands separators;
# anything that still is not a number becomes 0.
employees_text = (
    df['Employees'].astype(str)
    .str.replace(r'\[.*\]', '', regex=True)
    .str.replace(',', '')
)
df['Employees'] = pd.to_numeric(employees_text, errors='coerce').fillna(0).astype(int)

# Creating a 'Rating' column based on revenue thresholds (upper bounds inclusive):
#   <= 1000 -> 1, <= 10000 -> 2, <= 50000 -> 3, <= 100000 -> 4, above -> 5.
# pd.cut uses right-closed intervals by default, which matches those rules.
# A missing revenue falls in no interval and gets the fallback rating 0.
revenue_bins = [-float('inf'), 1000, 10000, 50000, 100000, float('inf')]
rating = pd.cut(df['Revenue (USD millions)'], bins=revenue_bins, labels=[1, 2, 3, 4, 5])
df['Rating'] = rating.astype(float).fillna(0).astype(int)


df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,Rating
0,1,Walmart,Retail,611289.0,6.7%,2100000,"Bentonville, Arkansas",5
1,2,Amazon,Retail and cloud computing,513983.0,9.4%,1540000,"Seattle, Washington",5
2,3,ExxonMobil,Petroleum industry,413680.0,44.8%,62000,"Spring, Texas",5
3,4,Apple,Electronics industry,394328.0,7.8%,164000,"Cupertino, California",5
4,5,UnitedHealth Group,Healthcare,324162.0,12.7%,400000,"Minnetonka, Minnesota",5
...,...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298.0,10.6%,71100,"Richfield, Minnesota",3
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159.0,0.5%,34300,"New York City, New York",3
97,98,United Airlines,Airline,44955.0,82.5%,92795,"Chicago, Illinois",3
98,99,Thermo Fisher Scientific,Laboratory instruments,44915.0,14.5%,130000,"Waltham, Massachusetts",3


In [27]:
# Save the DataFrame with ratings to CSV, then reload it from that same file
# so the rest of the cell works on the round-tripped data.
ratings_csv = r'C:\hehe\company_with_rating.csv'
df.to_csv(ratings_csv, index=False)
df = pd.read_csv(ratings_csv)

# Classification: predict 'Rating' from 'Revenue (USD millions)' and 'Employees'.
# NOTE(review): 'Rating' is assigned purely from revenue thresholds, so the
# target is a deterministic function of one of the two features.
feature_columns = ['Revenue (USD millions)', 'Employees']
X, y = df[feature_columns], df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree classifier fitted on the 80% training split
# (fit() returns the estimator, so construction and fitting are chained).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions for the 20% test split
y_pred = clf.predict(X_test)

# Adding predictions for every row (train and test) to the DataFrame
df['Predicted_Rating'] = clf.predict(X)
df



Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,Rating,Predicted_Rating
0,1,Walmart,Retail,611289.0,6.7%,2100000,"Bentonville, Arkansas",5,5
1,2,Amazon,Retail and cloud computing,513983.0,9.4%,1540000,"Seattle, Washington",5,5
2,3,ExxonMobil,Petroleum industry,413680.0,44.8%,62000,"Spring, Texas",5,5
3,4,Apple,Electronics industry,394328.0,7.8%,164000,"Cupertino, California",5,5
4,5,UnitedHealth Group,Healthcare,324162.0,12.7%,400000,"Minnetonka, Minnesota",5,5
...,...,...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298.0,10.6%,71100,"Richfield, Minnesota",3,3
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159.0,0.5%,34300,"New York City, New York",3,3
97,98,United Airlines,Airline,44955.0,82.5%,92795,"Chicago, Illinois",3,3
98,99,Thermo Fisher Scientific,Laboratory instruments,44915.0,14.5%,130000,"Waltham, Massachusetts",3,3


In [28]:
# Save DataFrame with predictions to CSV
df.to_csv(r'C:\hehe\company_with_predictions.csv', index=False)

# Evaluation metrics for classification, all computed on the test split
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')

# Each tuple holds the positional arguments of one print() call, so the
# two-argument lines keep print's default single-space separator.
report_lines = [
    (f"Accuracy: {accuracy}",),
    ("Classification Report:\n", classification_rep),
    ("Confusion Matrix:\n", conf_matrix),
    (f"Precision: {precision}",),
]
for parts in report_lines:
    print(*parts)
df

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         9

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Confusion Matrix:
 [[ 1  0  0]
 [ 0 10  0]
 [ 0  0  9]]
Precision: 1.0


Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,Rating,Predicted_Rating
0,1,Walmart,Retail,611289.0,6.7%,2100000,"Bentonville, Arkansas",5,5
1,2,Amazon,Retail and cloud computing,513983.0,9.4%,1540000,"Seattle, Washington",5,5
2,3,ExxonMobil,Petroleum industry,413680.0,44.8%,62000,"Spring, Texas",5,5
3,4,Apple,Electronics industry,394328.0,7.8%,164000,"Cupertino, California",5,5
4,5,UnitedHealth Group,Healthcare,324162.0,12.7%,400000,"Minnetonka, Minnesota",5,5
...,...,...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298.0,10.6%,71100,"Richfield, Minnesota",3,3
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159.0,0.5%,34300,"New York City, New York",3,3
97,98,United Airlines,Airline,44955.0,82.5%,92795,"Chicago, Illinois",3,3
98,99,Thermo Fisher Scientific,Laboratory instruments,44915.0,14.5%,130000,"Waltham, Massachusetts",3,3


In [29]:
# Load the copy of the company table that has modified revenue values.
# The file is read as ISO-8859-1 rather than pandas' default UTF-8.
modified_csv = r'C:\hehe\company_modify.csv'
df1 = pd.read_csv(modified_csv, encoding='ISO-8859-1')
df1

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,56456,6.70%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,56,9.40%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,5,44.80%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,567,7.80%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.70%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.60%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.50%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.50%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.50%,130000,"Waltham, Massachusetts"


In [30]:
# Function to process and predict ratings
def process_and_predict(df):
    """Clean a company table, derive revenue ratings, and fit/evaluate a tree.

    The function:
      1. converts 'Revenue (USD millions)' to float and 'Employees' to int,
      2. adds a 'Rating' column (1-5) from fixed revenue thresholds,
      3. writes the frame to company_with_rating.csv and reads it back,
      4. trains a DecisionTreeClassifier on an 80/20 split to predict 'Rating'
         from revenue and employees,
      5. adds 'Predicted_Rating' for every row, writes
         company_with_predictions.csv, and prints accuracy, the
         classification report and the confusion matrix for the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Revenue (USD millions)' and 'Employees'. The values may
        be strings (with thousands separators or bracketed markers) or
        already numeric. Steps 1-2 are written into this frame in place.

    Returns
    -------
    pandas.DataFrame
        The frame re-read from company_with_rating.csv, with the added
        'Predicted_Rating' column.

    Raises
    ------
    ValueError
        If a revenue value cannot be converted to float.
    """
    # Casting to str first lets the .str accessor work whether the column
    # holds text (freshly scraped) or numbers (parsed by read_csv, or already
    # cleaned by an earlier call on the same frame).
    df['Revenue (USD millions)'] = (
        df['Revenue (USD millions)'].astype(str).str.replace(',', '').astype(float)
    )

    # Employees: strip bracketed markers such as "[1]" and thousands
    # separators; anything still non-numeric becomes 0.
    employees_text = (
        df['Employees'].astype(str)
        .str.replace(r'\[.*\]', '', regex=True)
        .str.replace(',', '')
    )
    df['Employees'] = pd.to_numeric(employees_text, errors='coerce').fillna(0).astype(int)

    # Rating thresholds (upper bounds inclusive, matching pd.cut's default
    # right-closed intervals):
    #   <= 1000 -> 1, <= 10000 -> 2, <= 50000 -> 3, <= 100000 -> 4, above -> 5.
    # A missing revenue falls in no interval and gets the fallback rating 0.
    revenue_bins = [-float('inf'), 1000, 10000, 50000, 100000, float('inf')]
    rating = pd.cut(df['Revenue (USD millions)'], bins=revenue_bins, labels=[1, 2, 3, 4, 5])
    df['Rating'] = rating.astype(float).fillna(0).astype(int)

    # Persist the rated table and continue with the version read back from disk.
    df.to_csv(r'C:\hehe\company_with_rating.csv', index=False)
    df = pd.read_csv(r'C:\hehe\company_with_rating.csv')

    # Features / target and a reproducible 80/20 split.
    X = df[['Revenue (USD millions)', 'Employees']]
    y = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Test-split predictions are used for the metrics below; predictions for
    # every row are stored on the returned frame.
    y_pred = clf.predict(X_test)
    df['Predicted_Rating'] = clf.predict(X)
    df.to_csv(r'C:\hehe\company_with_predictions.csv', index=False)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

    return df




In [31]:
# Example usage of the function for df with modified revenue values:
# df1 is the frame loaded from company_modify.csv. The call prints the
# evaluation metrics, and because it is the cell's last expression the
# returned DataFrame is displayed as the cell output.
process_and_predict(df1)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         9
           5       1.00      1.00      1.00         1

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Confusion Matrix:
 [[9 0 0 0]
 [0 1 0 0]
 [0 0 9 0]
 [0 0 0 1]]


Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,Rating,Predicted_Rating
0,1,Walmart,Retail,56456.0,6.70%,2100000,"Bentonville, Arkansas",4,4
1,2,Amazon,Retail and cloud computing,56.0,9.40%,1540000,"Seattle, Washington",1,1
2,3,ExxonMobil,Petroleum industry,5.0,44.80%,62000,"Spring, Texas",1,1
3,4,Apple,Electronics industry,567.0,7.80%,164000,"Cupertino, California",1,1
4,5,UnitedHealth Group,Healthcare,324162.0,12.70%,400000,"Minnetonka, Minnesota",5,5
...,...,...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298.0,10.60%,71100,"Richfield, Minnesota",3,3
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159.0,0.50%,34300,"New York City, New York",3,3
97,98,United Airlines,Airline,44955.0,82.50%,92795,"Chicago, Illinois",3,3
98,99,Thermo Fisher Scientific,Laboratory instruments,44915.0,14.50%,130000,"Waltham, Massachusetts",3,3


In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Regression: Predicting 'Revenue (USD millions)' based on 'Employees'.
# The model works on `df`, the frame that already holds the cleaned numeric
# columns plus 'Rating' and 'Predicted_Rating'.
X_reg = df[['Employees']]
y_reg = df['Revenue (USD millions)']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

#using ml component linear regression
reg = LinearRegression()
reg.fit(X_train_reg, y_train_reg)

y_pred_reg = reg.predict(X_test_reg)

# Evaluation metrics for regression, computed on the 20% test split
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Adding regression predictions for every row to the DataFrame.
# X_reg is the same single-column 'Employees' selection used for the split.
df['Predicted_Revenue'] = reg.predict(X_reg)
df.to_csv(r'C:\hehe\company_with_predictions_and_revenue.csv', index=False)
df

Mean Squared Error: 4030159999.511718
R-squared: 0.7590107203644677


Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,Rating,Predicted_Rating,Predicted_Revenue
0,1,Walmart,Retail,611289.0,6.7%,2100000,"Bentonville, Arkansas",5,5,520489.047392
1,2,Amazon,Retail and cloud computing,513983.0,9.4%,1540000,"Seattle, Washington",5,5,403318.643062
2,3,ExxonMobil,Petroleum industry,413680.0,44.8%,62000,"Spring, Texas",5,5,94072.468775
3,4,Apple,Electronics industry,394328.0,7.8%,164000,"Cupertino, California",5,5,115414.220993
4,5,UnitedHealth Group,Healthcare,324162.0,12.7%,400000,"Minnetonka, Minnesota",5,5,164793.177103
...,...,...,...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298.0,10.6%,71100,"Richfield, Minnesota",3,3,95976.487846
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159.0,0.5%,34300,"New York City, New York",3,3,88276.718418
97,98,United Airlines,Airline,44955.0,82.5%,92795,"Chicago, Illinois",3,3,100515.794849
98,99,Thermo Fisher Scientific,Laboratory instruments,44915.0,14.5%,130000,"Waltham, Massachusetts",3,3,108300.303587


In [33]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix


# Independent copy of the current frame, including the prediction columns.
df2 = df.copy()

# Test-split revenue predictions from the fitted linear regression.
y_pred_reg = reg.predict(X_test_reg)

# Binning predicted and actual revenues for evaluation.
# The intervals are right-closed, mirroring the rating rule:
#   <= 1000 -> 1, <= 10000 -> 2, <= 50000 -> 3, <= 100000 -> 4, above -> 5.
# The lowest edge is -inf rather than 0 because a linear model can predict
# zero or negative revenue. With a lower edge of 0 those predictions would
# fall outside every bin and become NaN instead of rating 1.
bins = [-np.inf, 1000, 10000, 50000, 100000, np.inf]
labels = [1, 2, 3, 4, 5]

y_test_binned = pd.cut(y_test_reg, bins=bins, labels=labels)
y_pred_binned = pd.cut(y_pred_reg, bins=bins, labels=labels)

# Score the regression as if it were a classifier over the binned ratings.
# zero_division=0 scores a class that was never predicted as 0 precision.
accuracy = accuracy_score(y_test_binned, y_pred_binned)
precision = precision_score(y_test_binned, y_pred_binned, average='weighted', zero_division=0)
conf_matrix = confusion_matrix(y_test_binned, y_pred_binned)


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.65
Precision: 0.6375
Confusion Matrix:
 [[0 0 1]
 [0 6 4]
 [0 2 7]]
