<a href="https://colab.research.google.com/github/tharushaliyanagama/OralCancerEarlyDetection-DSGP/blob/Prediction-of-textual-data-I/data_preprocessing_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import Libraries**

In [None]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# **2. Load and Inspect the Dataset**

In [None]:
# Mount Google Drive at /content/drive so that files stored under MyDrive can be
# read by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the raw oral-cancer dataset from the mounted Drive into a DataFrame
raw_dataset_path = '/content/drive/MyDrive/DSGP/Siyumi/Modelnew/oral_cancer_prediction_dataset (1).csv'
data = pd.read_csv(raw_dataset_path)

In [None]:
# Display column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ID                    160292 non-null  int64  
 1   Country               160292 non-null  object 
 2   Gender                160292 non-null  object 
 3   Age                   160292 non-null  int64  
 4   Tobacco_Use           160292 non-null  int64  
 5   Alcohol_Use           160292 non-null  int64  
 6   Socioeconomic_Status  160292 non-null  object 
 7   Diagnosis_Stage       160292 non-null  object 
 8   Treatment_Type        160292 non-null  object 
 9   Survival_Rate         160292 non-null  float64
 10  HPV_Related           160292 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 13.5+ MB
None


In [None]:
# Report the dataset dimensions as (rows, columns)
print(f"\nDataset Shape:\n{data.shape}")


Dataset Shape:
(160292, 11)


In [None]:
# Preview the first five rows of the dataset
print("\nFirst 5 rows:", data.head(), sep="\n")


First 5 rows:
   ID   Country  Gender  Age  Tobacco_Use  Alcohol_Use Socioeconomic_Status  \
0   1  Ethiopia    Male   34            1            1                 High   
1   2    Turkey  Female   84            1            1                 High   
2   3    Turkey  Female   62            1            1               Middle   
3   4  Tanzania    Male   48            1            1               Middle   
4   5    France    Male   26            1            1               Middle   

  Diagnosis_Stage Treatment_Type  Survival_Rate  HPV_Related  
0           Early   Radiotherapy       0.826235            0  
1        Moderate   Radiotherapy       0.376607            0  
2           Early   Radiotherapy       0.736296            1  
3        Moderate    Combination       0.786118            0  
4           Early   Radiotherapy       0.830411            0  


In [None]:
# Remove the columns that are not used as predictors
columns_to_drop = ['ID', 'Treatment_Type', 'Survival_Rate']
print(f"\nDropping columns: {columns_to_drop}")
data = data.drop(labels=columns_to_drop, axis=1)


Dropping columns: ['ID', 'Treatment_Type', 'Survival_Rate']


In [None]:
# Sub-frame holding only the int64 / float64 columns
numeric_columns = data.select_dtypes(include=['int64', 'float64'])

# Show the first rows of the numeric variables
print("Numeric Variables:", numeric_columns.head(), sep="\n")

Numeric Variables:
   Age  Tobacco_Use  Alcohol_Use  HPV_Related
0   34            1            1            0
1   84            1            1            0
2   62            1            1            1
3   48            1            1            0
4   26            1            1            0


In [None]:
# Sub-frame holding only the object-typed (string) columns
categorical_columns = data.select_dtypes(include='object')

# Show the first rows of the categorical variables
print("\nCategorical Variables:", categorical_columns.head(), sep="\n")


Categorical Variables:
    Country  Gender Socioeconomic_Status Diagnosis_Stage
0  Ethiopia    Male                 High           Early
1    Turkey  Female                 High        Moderate
2    Turkey  Female               Middle           Early
3  Tanzania    Male               Middle        Moderate
4    France    Male               Middle           Early


# **Dataset Preprocessing**

In [None]:
# Count the missing entries in every column
null_counts = data.isna().sum()
print(f"\nNull Values:\n{null_counts}")


Null Values:
Country                 0
Gender                  0
Age                     0
Tobacco_Use             0
Alcohol_Use             0
Socioeconomic_Status    0
Diagnosis_Stage         0
HPV_Related             0
dtype: int64


In [None]:
# Impute missing values: mean for numeric features, mode for categorical ones.
#
# SimpleImputer always returns a float array for numeric input, so writing its
# output straight back would turn integer columns (Age and the 0/1 flags) into
# floats such as 34.0 / 1.0. The original dtypes are therefore recorded first and
# restored afterwards. This is lossless: a column can only be int64 when it has
# no missing values, in which case the imputer leaves its values unchanged.
numeric_dtypes = data[numeric_columns.columns].dtypes.to_dict()

# Replace missing numerical features by the column mean
mean_imputer = SimpleImputer(strategy='mean')
data[numeric_columns.columns] = mean_imputer.fit_transform(data[numeric_columns.columns])
data = data.astype(numeric_dtypes)

# Replace missing categorical features by the most frequent value (mode)
mode_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_columns.columns] = mode_imputer.fit_transform(data[categorical_columns.columns])

In [None]:
# Keep only the first occurrence of each fully identical row
# NOTE(review): with ID already dropped, identical rows may be distinct patients —
# confirm that removing them is intended.
data = data.drop_duplicates(keep='first')

# Verify that no duplicates are left
remaining_duplicates = data.duplicated().sum()
print(f"\nNumber of duplicate rows after removal: {remaining_duplicates}")


Number of duplicate rows after removal: 0


In [None]:
# Dimensions (rows, columns) of the frame after duplicate removal
data.shape

(105645, 8)

# **Column-wise Preprocessing**

In [None]:
# List the distinct values found in each column
for col_name, col_values in data.items():
    print(f"\nUnique values in column '{col_name}':")
    print(col_values.unique())


Unique values in column 'Country':
['Ethiopia' 'Turkey' 'Tanzania' 'France' 'China' 'Colombia' 'Japan'
 'Nigeria' 'Brazil' 'Kenya' 'United Kingdom' 'Germany' 'Myanmar'
 'Philippines' 'Russia' 'Thailand' 'Mexico' 'Bangladesh' 'Iran'
 'United States' 'South Africa' 'DR Congo' 'Spain' 'Egypt' 'Italy'
 'Indonesia' 'Pakistan' 'India' 'South Korea' 'Vietnam']

Unique values in column 'Gender':
['Male' 'Female']

Unique values in column 'Age':
[34. 84. 62. 48. 26. 22. 76. 49. 60. 20. 86. 72. 28. 27. 31. 41. 36. 50.
 25. 37. 67. 40. 21. 70. 23. 57. 24. 39. 80. 30. 55. 32. 43. 66. 83. 79.
 81. 35. 33. 69. 88. 71. 59. 75. 51. 52. 78. 38. 65. 44. 29. 64. 82. 42.
 87. 56. 73. 53. 58. 77. 61. 54. 63. 74. 45. 68. 46. 85. 47. 89.]

Unique values in column 'Tobacco_Use':
[1. 0.]

Unique values in column 'Alcohol_Use':
[1. 0.]

Unique values in column 'Socioeconomic_Status':
['High' 'Middle' 'Low']

Unique values in column 'Diagnosis_Stage':
['Early' 'Moderate' 'Late']

Unique values in column 'HPV_Re

In [None]:
# Number of records per country
country_counts = data['Country'].value_counts()
print("\nCountry Value Counts:", country_counts, sep="\n")


Country Value Counts:
Country
Germany           3592
Iran              3579
Turkey            3571
India             3568
Vietnam           3564
Pakistan          3551
Ethiopia          3550
Colombia          3536
Italy             3534
United Kingdom    3529
South Korea       3526
United States     3525
China             3524
Japan             3524
Tanzania          3523
Brazil            3522
Russia            3520
Nigeria           3519
Kenya             3517
Philippines       3516
Bangladesh        3512
South Africa      3510
France            3500
Thailand          3494
Indonesia         3492
DR Congo          3485
Spain             3474
Mexico            3473
Myanmar           3471
Egypt             3444
Name: count, dtype: int64


In [None]:
# Group the countries by continent. Listing each country once under its continent
# avoids duplicate keys in the lookup and keeps every country next to the
# continent it is actually mapped to.
countries_by_continent = {
    'Europe': ['Germany', 'United Kingdom', 'Italy', 'France', 'Spain', 'Russia', 'Turkey'],
    'Asia': ['Vietnam', 'Philippines', 'Pakistan', 'Bangladesh', 'South Korea', 'China',
             'Japan', 'India', 'Myanmar', 'Thailand', 'Indonesia', 'Iran'],
    'South America': ['Colombia', 'Brazil'],
    'North America': ['Mexico', 'United States'],
    'Africa': ['Kenya', 'Nigeria', 'DR Congo', 'Ethiopia', 'South Africa', 'Tanzania', 'Egypt'],
}

# Define a mapping of countries to continents (country -> continent lookup)
continent_map = {
    country: continent
    for continent, countries in countries_by_continent.items()
    for country in countries
}

# Map every row's country to its continent. Series.map yields NaN for values that
# are missing from the mapping, so fail loudly instead of silently saving NaNs.
continent = data['Country'].map(continent_map)
unmapped_countries = data.loc[continent.isna(), 'Country'].unique()
if len(unmapped_countries) > 0:
    raise ValueError(f"No continent mapping for countries: {list(unmapped_countries)}")

# Create a new 'Continent' column
data['Continent'] = continent

# Count occurrences per continent
continent_counts = data['Continent'].value_counts()

# Display results
print("\nContinent Value Counts:")
print(continent_counts)


Continent Value Counts:
Continent
Asia             42321
Europe           24720
Africa           24548
South America     7058
North America     6998
Name: count, dtype: int64


In [None]:
# Preview the first rows to check the newly added Continent column
data.head()

Unnamed: 0,Country,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,Diagnosis_Stage,HPV_Related,Continent
0,Ethiopia,Male,34.0,1.0,1.0,High,Early,0.0,Africa
1,Turkey,Female,84.0,1.0,1.0,High,Moderate,0.0,Europe
2,Turkey,Female,62.0,1.0,1.0,Middle,Early,1.0,Europe
3,Tanzania,Male,48.0,1.0,1.0,Middle,Moderate,0.0,Africa
4,France,Male,26.0,1.0,1.0,Middle,Early,0.0,Europe


In [None]:
# Country is now represented by Continent, so remove the original column
data = data.drop('Country', axis=1)

In [None]:
# Preview the first rows to check that the Country column has been removed
data.head()

Unnamed: 0,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,Diagnosis_Stage,HPV_Related,Continent
0,Male,34.0,1.0,1.0,High,Early,0.0,Africa
1,Female,84.0,1.0,1.0,High,Moderate,0.0,Europe
2,Female,62.0,1.0,1.0,Middle,Early,1.0,Europe
3,Male,48.0,1.0,1.0,Middle,Moderate,0.0,Africa
4,Male,26.0,1.0,1.0,Middle,Early,0.0,Europe


**Gender Column**

In [None]:

# Number of records per gender
gender_counts = data['Gender'].value_counts()
gender_counts

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,65479
Female,40166


**Socioeconomic_Status column Analysis**

In [None]:
# Number of records per socioeconomic status level
socioeconomic_counts = data['Socioeconomic_Status'].value_counts()
socioeconomic_counts

Unnamed: 0_level_0,count
Socioeconomic_Status,Unnamed: 1_level_1
Low,47499
Middle,38101
High,20045


# **Save Preprocessed Dataset**

In [None]:
# Persist the preprocessed frame as a CSV file on the mounted Google Drive.
# Drive is mounted in the data-loading section and `os` is imported with the other
# libraries at the top of the notebook, so neither is repeated here (re-mounting
# only produced a "Drive already mounted" notice).
# NOTE(review): the recorded output of this cell names
# preprocessed_new_dataset_2.csv — confirm which filename downstream notebooks expect.

# Define the correct directory and filename
folder_path = "/content/drive/MyDrive/DSGP/Siyumi/Modelnew"
file_name = "preprocessed_new_dataset.csv"
full_path = os.path.join(folder_path, file_name)

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# Save the dataset; index=False keeps the row index out of the CSV
data.to_csv(full_path, index=False)

print(f"Preprocessed dataset saved to: {full_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Preprocessed dataset saved to: /content/drive/MyDrive/DSGP/Siyumi/Modelnew/preprocessed_new_dataset_2.csv
