In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the dataset
# NOTE: unpickling can execute arbitrary code, so only point this at a file
# you created yourself or obtained from a trusted source.
dataset_directory = "LinkedIn_Dataset.pcl"  # Change this according to your directory
dataset = pd.read_pickle(dataset_directory)

# Inspect the dataset
print(dataset.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
dataset.info()

                                               Intro            Full Name  \
0  {'Full Name': 'chenxia (polly) Pei', 'Workplac...  chenxia (polly) Pei   
1  {'Full Name': 'NEHA CHANDOK', 'Workplace': 'So...         NEHA CHANDOK   
2  {'Full Name': 'Mounika Mungamuri', 'Workplace'...    Mounika Mungamuri   
3  {'Full Name': 'Katarina Djuric', 'Workplace': ...      Katarina Djuric   
4  {'Full Name': 'Rachel Lally', 'Workplace': '--...         Rachel Lally   

                                           Workplace  \
0  Jiangsu Junyao mainly offer services to cement...   
1                                   Software Analyst   
2                       Senior Consultant at Infosys   
3                                                 --   
4                                                 --   

                         Location Connections Photo Followers  \
0            Wuxi, Jiangsu, China         500    No       717   
1     Noida, Uttar Pradesh, India         500    No      1340   
2    

In [3]:
# Convert dict-valued columns to their string representation so they can be
# handled like any other string column below.
for col in dataset.columns:
    if dataset[col].apply(lambda x: isinstance(x, dict)).any():
        dataset[col] = dataset[col].apply(str)

# Fill missing values for categorical columns
for col in ('Full Name', 'Workplace', 'Location'):
    dataset[col] = dataset[col].fillna('Unknown')

# Identify non-numeric columns. The target column 'Label' is explicitly kept
# out of this list: if it were stored with object dtype it would otherwise be
# one-hot encoded and no longer exist as a single 'Label' column afterwards.
# errors='ignore' makes this a no-op when 'Label' is already numeric.
non_numeric_columns = (
    dataset.select_dtypes(include=['object']).columns.drop('Label', errors='ignore')
)

# One-hot encode non-numeric columns (drop_first removes one level per column).
# NOTE(review): 'Intro' and 'Full Name' look close to unique per row in the
# preview output, which would produce roughly one dummy column per profile --
# confirm whether they should be features at all.
dataset = pd.get_dummies(dataset, columns=non_numeric_columns, drop_first=True)

In [4]:
# Separate the target column from the feature columns
y = dataset['Label']
X = dataset.drop(columns=['Label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Standardize the features: learn the scaling statistics from the training
# split only, then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Build a 100-tree random forest with a fixed seed, then fit it on the training split
forest_options = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_options)
clf.fit(X_train, y_train)

In [7]:
# Make predictions: ask the fitted classifier for a class label for every row
# of the (already scaled) test features.
y_pred = clf.predict(X_test)


In [8]:
# Evaluate the model: compute each metric from the held-out labels and the
# predictions first, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.9638888888888889
Confusion Matrix:
 [[344   5   0   0]
 [ 16 131   0   0]
 [  0   0 114   3]
 [  0   1   1 105]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97       349
           1       0.96      0.89      0.92       147
          10       0.99      0.97      0.98       117
          11       0.97      0.98      0.98       107

    accuracy                           0.96       720
   macro avg       0.97      0.96      0.96       720
weighted avg       0.96      0.96      0.96       720



In [9]:
# Feature importance: pair each importance score from the fitted forest with
# its column name, then rank from most to least important.
feature_names = X.columns
importances = clf.feature_importances_
importance_by_feature = pd.Series(data=importances, index=feature_names)
feature_importances = importance_by_feature.sort_values(ascending=False)

print("Feature Importances:\n", feature_importances)

Feature Importances:
 Workplace_Unknown                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 