In [53]:
#importing library
import numpy as np
import pandas as pd
import openpyxl
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import openpyxl

# Nominal Features
Nominal features are variables or attributes in a dataset that represent categorical data without any inherent order or hierarchy. They are one kind of categorical (qualitative) variable. Nominal features are typically represented by labels or categories, such as colors, gender, or species.

For example, in a dataset of customer information, "gender" would be a nominal feature with two possible categories: male and female. Another example would be the "color" of a car, which could be one of several categories such as red, blue, green, etc.

Nominal features are distinct from ordinal features, which represent categorical data with an inherent order or hierarchy. For example, a rating system from 1 to 5 would be an ordinal feature, because the categories have a clear order.

Nominal features can be used in statistical analysis, but they require special handling since they cannot be ordered or ranked in any meaningful way. One common approach is to use one-hot encoding, which creates new binary variables for each category in the nominal feature. This allows the nominal feature to be represented as a set of numerical variables that can be used in statistical models.



# Ordinal Features

Ordinal features are variables or attributes in a dataset that represent categorical data with an inherent order or hierarchy. They are also known as ordered categorical variables or rank variables. Ordinal features are typically represented by labels or categories that have a clear ordering, such as ratings, levels, or sizes.

For example, in a dataset of restaurant reviews, "rating" would be an ordinal feature with categories such as "poor," "fair," "good," "very good," and "excellent." Another example would be the "size" of a shirt, which could be ordered from "small" to "medium" to "large."

Ordinal features are distinct from nominal features, which represent categorical data without any inherent order or hierarchy. For example, the color of a car would be a nominal feature, since there is no inherent order or ranking to the categories.

Ordinal features can be used in statistical analysis and modeling, but they require special handling to account for their ordering. One common approach is to assign numerical values to the categories based on their rank, such as 1 for "poor," 2 for "fair," and so on. However, care must be taken to ensure that the assigned values reflect the true ordering of the categories, and to remember that such integer codes imply equal spacing between adjacent categories, which the data may not support.


# Categorical Features



Categorical features are variables or attributes in a dataset that represent qualitative data that can be divided into discrete categories or groups. They can be nominal, ordinal or binary features. Categorical features are often used to represent non-numerical data or attributes, such as gender, occupation, or type of product.

Nominal categorical features represent categories or labels that have no inherent order or hierarchy, such as colors or types of fruit. Ordinal categorical features, on the other hand, represent categories or labels that have an inherent order or ranking, such as education level or customer satisfaction ratings.

Binary categorical features are a special case of nominal categorical features, where there are only two possible categories, such as yes/no or true/false. Binary features are often used in machine learning classification problems.

Categorical features require special handling in data analysis and modeling, as they cannot be treated as numerical data. One common approach is to use one-hot encoding to create dummy variables for each category in the feature, which allows the categorical feature to be represented numerically. Another approach is to use ordinal encoding, which assigns numerical values to the categories based on their order or ranking. However, care must be taken to ensure that the assigned values reflect the true order or ranking of the categories.

In [54]:
pip install openpyxl 

[0mNote: you may need to restart the kernel to use updated packages.


# Loading the Dataset

In [55]:
# Read the Excel workbook from the Kaggle input directory into a DataFrame.
df = pd.read_excel(
    io='/kaggle/input/largest-2000-companies-in-the-world-by-revenue/Largest Companies in the World.xlsx'
)
# Keep the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,Global Rank,Company,Sales ($billion),Profits ($billion),Assets ($billion),Market Value ($billion),Country,Continent,Latitude,Longitude
0,1.0,ICBC,134.8,37.8,2813.5,237.3,China,Asia,35.861660,104.195397
1,2.0,China Construction Bank,113.1,30.6,2241.0,202.0,China,Asia,35.861660,104.195397
2,3.0,JPMorgan Chase,108.2,21.3,2359.1,191.4,USA,North America,37.090240,-95.712891
3,4.0,General Electric,147.4,13.6,685.3,243.7,USA,North America,37.090240,-95.712891
4,5.0,Exxon Mobil,420.7,44.9,333.8,400.4,USA,North America,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...
1919,1995.0,Tractor Supply,4.7,0.3,1.7,7.1,USA,North America,37.090240,-95.712891
1920,1996.0,San-Ai Oil,0.5,0.1,25.7,0.5,Japan,Asia,36.204824,138.252924
1921,1996.0,UOL Group,0.9,0.7,7.8,4.2,Singapore,Asia,1.352083,103.819836
1922,1998.0,Interconexion Electrica,2.4,0.2,14.6,5.8,Colombia,South America,4.570868,-74.297333


# Getting the Preliminary Information

In [56]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1924 entries, 0 to 1923
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Global Rank              1924 non-null   float64
 1   Company                  1924 non-null   object 
 2   Sales ($billion)         1924 non-null   float64
 3   Profits ($billion)       1924 non-null   float64
 4   Assets ($billion)        1924 non-null   float64
 5   Market Value ($billion)  1924 non-null   float64
 6   Country                  1924 non-null   object 
 7   Continent                1924 non-null   object 
 8   Latitude                 1924 non-null   float64
 9   Longitude                1924 non-null   float64
dtypes: float64(7), object(3)
memory usage: 150.4+ KB


# Checking for Missing Values

In [57]:
# Count the missing entries in each column (isnull is an alias of isna).
df.isnull().sum()

Global Rank                0
Company                    0
Sales ($billion)           0
Profits ($billion)         0
Assets ($billion)          0
Market Value ($billion)    0
Country                    0
Continent                  0
Latitude                   0
Longitude                  0
dtype: int64

# Creating the Preprocess Function

In [58]:
def onehot_encode(df, columns):
    """Return a copy of ``df`` with each listed column one-hot encoded.

    Every column named in ``columns`` is removed and replaced by indicator
    columns named ``<column>_<value>``, appended at the right-hand side of
    the frame. The frame passed in is left unchanged.
    """
    encoded = df.copy()
    for name in columns:
        # Build the indicator columns first, while the source column is still present.
        indicators = pd.get_dummies(encoded[name], prefix=name)
        remaining = encoded.drop(name, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

In [59]:
def preprocess_inputs(df, train_size=0.7, random_state=42):
    """Turn the raw company table into scaled train/test features and targets.

    Steps: drop the 'Global Rank' and 'Company' columns, one-hot encode
    'Country' and 'Continent', separate the 'Market Value ($billion)' target,
    split into train and test sets, and standardise the features with a
    scaler fitted on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is copied, so the caller's frame is not modified.
    train_size : float, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so repeated runs produce the
        same split. Pass ``None`` to get a different split on every call.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Standardised feature matrices, indexed like the matching targets.
    y_train, y_test : pandas.Series
        Target values for each split.
    """
    df = df.copy()

    # Drop the columns that are not used as features.
    df = df.drop(['Global Rank', 'Company'], axis=1)

    onehot_columns = ['Country', 'Continent']
    df = onehot_encode(df, onehot_columns)

    y = df['Market Value ($billion)']
    x = df.drop('Market Value ($billion)', axis=1)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=train_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(x_train)
    # Carry the original row labels over so the features stay aligned with
    # y_train / y_test, which keep their index through the split.
    x_train = pd.DataFrame(
        scaler.transform(x_train), columns=x_train.columns, index=x_train.index
    )
    x_test = pd.DataFrame(
        scaler.transform(x_test), columns=x_test.columns, index=x_test.index
    )

    return x_train, x_test, y_train, y_test

In [60]:
# Build the scaled train/test splits and print the shape of each, one per line.
x_train, x_test, y_train, y_test = preprocess_inputs(df)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

(1346, 71)
(578, 71)
(1346,)
(578,)


# Compiling the Neural Network Model

In [61]:
# Size the input layer from the training matrix instead of hard-coding the
# column count, so the model still builds if the set of one-hot columns changes.
inputs = tf.keras.Input(shape=(x_train.shape[1],))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# Single linear unit: the network outputs one unbounded value per row (regression).
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='mse')

In [62]:
# Train for at most 100 epochs with 20% of the training data held out for
# validation. Early stopping watches the validation loss, gives up after 3
# epochs without improvement, and restores the best weights seen.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
        )
    ],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [63]:
# The model emits one column per row; drop only that trailing axis so the
# result stays 1-D even when x_test holds a single row.
y_pred = np.squeeze(model.predict(x_test), axis=-1)



In [64]:
# Root-mean-squared error of the neural network's predictions on the test split.
print('Neural Network', np.sqrt(np.square(y_pred - y_test).mean()))

Neural Network 19.523783220367037


In [65]:
# Report the compiled loss (MSE) of the model on the test split.
model.evaluate(x=x_test, y=y_test)



381.17816162109375

In [66]:
from sklearn.ensemble import RandomForestRegressor

# Baseline to compare against the neural network. Despite its name, tf_model
# is a scikit-learn random forest, not a TensorFlow model. The seed is fixed
# so the forest, and therefore the printed RMSE, is the same on every run
# for a given train/test split.
tf_model = RandomForestRegressor(random_state=42)
tf_model.fit(x_train, y_train)
y_pred_rf = tf_model.predict(x_test)

# Root-mean-squared error on the test split, computed the same way as for the network.
print('Random Forest', np.sqrt(np.mean((y_pred_rf - y_test) ** 2)))

Random Forest 18.937588917237324


In [69]:
# Neural-network predictions against the true market values. The title names
# the model so this figure can be told apart from the random-forest one.
fig = px.scatter(
    x=y_pred,
    y=y_test,
    labels={'x': 'Predicted', 'y': 'Actual'},
    title='Actual vs Predicted Values (Neural Network)',
    width=700,
    height=700,
)
fig.show()

In [70]:
# Random-forest predictions against the true market values. The title names
# the model so this figure can be told apart from the neural-network one.
fig = px.scatter(
    x=y_pred_rf,
    y=y_test,
    labels={'x': 'Predicted', 'y': 'Actual'},
    title='Actual vs Predicted Values (Random Forest)',
    width=700,
    height=700,
)
fig.show()