<a href="https://colab.research.google.com/github/tharushaliyanagama/OralCancerEarlyDetection-DSGP/blob/Prediction-of-textual-data-I/data_preprocessing_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import Libraries**

In [74]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# **2. Load and Inspect the Dataset**

In [75]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [76]:
# Read the raw oral cancer CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/DSGP/Siyumi/Modelnew/new/oral_cancer_prediction_dataset.csv')

In [77]:
# Display column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() appends a stray "None" line.
print("Dataset Info:")
data.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ID                    160292 non-null  int64  
 1   Country               160292 non-null  object 
 2   Gender                160292 non-null  object 
 3   Age                   160292 non-null  int64  
 4   Tobacco_Use           160292 non-null  int64  
 5   Alcohol_Use           160292 non-null  int64  
 6   Socioeconomic_Status  160292 non-null  object 
 7   Diagnosis_Stage       160292 non-null  object 
 8   Treatment_Type        160292 non-null  object 
 9   Survival_Rate         160292 non-null  float64
 10  HPV_Related           160292 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 13.5+ MB
None


In [78]:
# Display the shape of the dataset as a (rows, columns) tuple
print("\nDataset Shape:")
print(data.shape)


Dataset Shape:
(160292, 11)


In [79]:
# Preview the first 5 rows (display only; nothing is loaded or modified here)
print("\nFirst 5 rows:")
print(data.head())


First 5 rows:
   ID   Country  Gender  Age  Tobacco_Use  Alcohol_Use Socioeconomic_Status  \
0   1  Ethiopia    Male   34            1            1                 High   
1   2    Turkey  Female   84            1            1                 High   
2   3    Turkey  Female   62            1            1               Middle   
3   4  Tanzania    Male   48            1            1               Middle   
4   5    France    Male   26            1            1               Middle   

  Diagnosis_Stage Treatment_Type  Survival_Rate  HPV_Related  
0           Early   Radiotherapy       0.826235            0  
1        Moderate   Radiotherapy       0.376607            0  
2           Early   Radiotherapy       0.736296            1  
3        Moderate    Combination       0.786118            0  
4           Early   Radiotherapy       0.830411            0  


**Drop Unwanted columns**

In [80]:
# Remove the columns that are not used for prediction, leaving Country, Gender,
# Age, Tobacco_Use, Alcohol_Use, Socioeconomic_Status and HPV_Related.
columns_to_drop = ['ID', 'Treatment_Type', 'Survival_Rate', 'Diagnosis_Stage']
print(f"\nDropping columns: {columns_to_drop}")
data = data.drop(columns_to_drop, axis='columns')


Dropping columns: ['ID', 'Treatment_Type', 'Survival_Rate', 'Diagnosis_Stage']


In [81]:
# Select numeric variables: a DataFrame holding only the int64/float64 columns.
# Later cells read its .columns to decide which columns to impute.
numeric_columns = data.select_dtypes(include=['int64', 'float64'])

# Print the first five rows of the numeric variables
print("Numeric Variables:")
print(numeric_columns.head())

Numeric Variables:
   Age  Tobacco_Use  Alcohol_Use  HPV_Related
0   34            1            1            0
1   84            1            1            0
2   62            1            1            1
3   48            1            1            0
4   26            1            1            0


In [82]:
# Select categorical variables: a DataFrame holding only the object-dtype columns.
# Later cells read its .columns to decide which columns to impute.
categorical_columns = data.select_dtypes(include=['object'])

# Print the first five rows of the categorical variables
print("\nCategorical Variables:")
print(categorical_columns.head())


Categorical Variables:
    Country  Gender Socioeconomic_Status
0  Ethiopia    Male                 High
1    Turkey  Female                 High
2    Turkey  Female               Middle
3  Tanzania    Male               Middle
4    France    Male               Middle


# **Dataset Preprocessing**

**Check Null values**

In [83]:
# Count the missing entries in each column (isna is an alias of isnull)
null_counts = data.isna().sum()
print("\nNull Values:")
print(null_counts)


Null Values:
Country                 0
Gender                  0
Age                     0
Tobacco_Use             0
Alcohol_Use             0
Socioeconomic_Status    0
HPV_Related             0
dtype: int64


**Handle Missing values**

In [84]:
# Column names captured earlier, before any imputation
numeric_names = list(numeric_columns.columns)
categorical_names = list(categorical_columns.columns)

# Numeric features: fill gaps with the column mean.
# The values written back show up as floats in the outputs that follow
# (e.g. Age 34 -> 34.0, flags 1 -> 1.0).
mean_imputer = SimpleImputer(strategy='mean')
imputed_numeric = mean_imputer.fit_transform(data[numeric_names])
data[numeric_names] = imputed_numeric

# Categorical features: fill gaps with the most frequent value (the mode)
mode_imputer = SimpleImputer(strategy='most_frequent')
imputed_categorical = mode_imputer.fit_transform(data[categorical_names])
data[categorical_names] = imputed_categorical

**Handle Duplicates**

In [85]:
# Count rows that exactly repeat an earlier row across all remaining columns
duplicate_count = data.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_count)


Number of duplicate rows: 95639


In [86]:
# Remove duplicates, keeping the first occurrence of each distinct row.
# NOTE(review): ID was dropped earlier, so rows that only share the same
# attribute values are counted as duplicates here; this removes 95639 of the
# 160292 rows. Confirm these are true repeats rather than distinct records.
data = data.drop_duplicates()

In [87]:
# Shape after duplicate removal, as (rows, columns)
data.shape

(64653, 7)

# **Column-wise Preprocessing**

In [88]:
# List the distinct values found in every column
for column_name, column_values in data.items():
    print(f"\nUnique values in column '{column_name}':")
    print(column_values.unique())


Unique values in column 'Country':
['Ethiopia' 'Turkey' 'Tanzania' 'France' 'China' 'Colombia' 'Japan'
 'Nigeria' 'Brazil' 'Kenya' 'United Kingdom' 'Germany' 'Myanmar'
 'Philippines' 'Russia' 'Thailand' 'Mexico' 'Bangladesh' 'Iran'
 'United States' 'South Africa' 'DR Congo' 'Spain' 'Egypt' 'Italy'
 'Indonesia' 'Pakistan' 'India' 'South Korea' 'Vietnam']

Unique values in column 'Gender':
['Male' 'Female']

Unique values in column 'Age':
[34. 84. 62. 48. 26. 22. 76. 49. 60. 20. 86. 72. 28. 27. 31. 41. 36. 50.
 25. 37. 67. 40. 21. 70. 23. 57. 24. 39. 80. 30. 55. 32. 43. 66. 83. 79.
 81. 35. 33. 69. 88. 71. 59. 75. 51. 52. 78. 38. 65. 44. 29. 64. 82. 42.
 87. 56. 73. 53. 58. 77. 61. 54. 63. 74. 45. 68. 46. 85. 47. 89.]

Unique values in column 'Tobacco_Use':
[1. 0.]

Unique values in column 'Alcohol_Use':
[1. 0.]

Unique values in column 'Socioeconomic_Status':
['High' 'Middle' 'Low']

Unique values in column 'HPV_Related':
[0. 1.]


**Country Column**

In [89]:
# Country column analysis: number of rows per country, most frequent first
country_counts = data['Country'].value_counts()
print("\nCountry Value Counts:")
print(country_counts)


Country Value Counts:
Country
Germany           2212
Vietnam           2206
Colombia          2197
Kenya             2184
Iran              2180
Philippines       2180
Pakistan          2173
Russia            2166
Turkey            2166
Nigeria           2160
Bangladesh        2160
DR Congo          2159
South Korea       2156
United Kingdom    2156
Ethiopia          2154
China             2154
Japan             2153
Brazil            2148
India             2148
South Africa      2144
Italy             2140
Myanmar           2139
United States     2139
Thailand          2138
Mexico            2135
Tanzania          2133
Indonesia         2131
France            2130
Egypt             2107
Spain             2105
Name: count, dtype: int64


In [90]:
# Define a mapping of countries to continents, grouped by continent.
# Every country is listed exactly once: a repeated key in a dict literal is
# silently collapsed to its last occurrence, which hides copy-paste mistakes.
continent_map = {
    # Europe (Russia and Turkey are counted as Europe in this mapping)
    'Germany': 'Europe', 'United Kingdom': 'Europe', 'Italy': 'Europe', 'France': 'Europe', 'Spain': 'Europe',
    'Russia': 'Europe', 'Turkey': 'Europe',

    # Asia
    'Vietnam': 'Asia', 'Philippines': 'Asia', 'Pakistan': 'Asia', 'Bangladesh': 'Asia', 'South Korea': 'Asia',
    'China': 'Asia', 'Japan': 'Asia', 'India': 'Asia', 'Myanmar': 'Asia', 'Thailand': 'Asia', 'Indonesia': 'Asia',
    'Iran': 'Asia',

    # South America
    'Colombia': 'South America', 'Brazil': 'South America',

    # North America
    'Mexico': 'North America', 'United States': 'North America',

    # Africa
    'Kenya': 'Africa', 'Nigeria': 'Africa', 'DR Congo': 'Africa', 'Ethiopia': 'Africa', 'South Africa': 'Africa',
    'Tanzania': 'Africa', 'Egypt': 'Africa',
}

# Create a new 'Continent' column
data['Continent'] = data['Country'].map(continent_map)

# Series.map yields NaN for any country missing from the mapping. Fail loudly
# here rather than let those rows carry on without a continent.
unmapped_countries = data.loc[data['Continent'].isna(), 'Country'].unique()
if len(unmapped_countries) > 0:
    raise ValueError(f"No continent defined for countries: {list(unmapped_countries)}")

# Count occurrences per continent
continent_counts = data['Continent'].value_counts()

# Display results
print("\nContinent Value Counts:")
print(continent_counts)


Continent Value Counts:
Continent
Asia             25918
Europe           15075
Africa           15041
South America     4345
North America     4274
Name: count, dtype: int64


In [91]:
# Confirm the new Continent column lines up with Country
data.head()

Unnamed: 0,Country,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,HPV_Related,Continent
0,Ethiopia,Male,34.0,1.0,1.0,High,0.0,Africa
1,Turkey,Female,84.0,1.0,1.0,High,0.0,Europe
2,Turkey,Female,62.0,1.0,1.0,Middle,1.0,Europe
3,Tanzania,Male,48.0,1.0,1.0,Middle,0.0,Africa
4,France,Male,26.0,1.0,1.0,Middle,0.0,Europe


In [92]:
# Drop the Country column; Continent now carries the geographic information
data = data.drop(columns=['Country'])

In [93]:
# Confirm Country is gone and Continent remains
data.head()

Unnamed: 0,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,HPV_Related,Continent
0,Male,34.0,1.0,1.0,High,0.0,Africa
1,Female,84.0,1.0,1.0,High,0.0,Europe
2,Female,62.0,1.0,1.0,Middle,1.0,Europe
3,Male,48.0,1.0,1.0,Middle,0.0,Africa
4,Male,26.0,1.0,1.0,Middle,0.0,Europe


**Gender Column**

In [94]:
# Gender column analysis: number of rows per gender value
data['Gender'].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,37156
Female,27497


**Age Column**

In [95]:
# Age column analysis: number of rows per distinct age, most frequent first
data['Age'].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
32.0,1198
36.0,1197
33.0,1195
26.0,1195
22.0,1194
...,...
81.0,802
52.0,802
56.0,800
57.0,799


In [96]:
# Bucket a single age into one of three age groups
def categorize_age(age):
    """Return the age-group label for one age value.

    'Young' for ages up to and including 40, 'Middle' for ages above 40 up to
    and including 60, and 'Older' for every other value.
    """
    if age <= 40:
        return 'Young'
    if age <= 60:
        return 'Middle'
    return 'Older'

# Label every row's Age with its group and store the result as 'Age_group'
age_values = data['Age']
data['Age_group'] = age_values.map(categorize_age)

In [97]:
# Confirm Age_group matches the Age values
data.head()

Unnamed: 0,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,HPV_Related,Continent,Age_group
0,Male,34.0,1.0,1.0,High,0.0,Africa,Young
1,Female,84.0,1.0,1.0,High,0.0,Europe,Older
2,Female,62.0,1.0,1.0,Middle,1.0,Europe,Older
3,Male,48.0,1.0,1.0,Middle,0.0,Africa,Middle
4,Male,26.0,1.0,1.0,Middle,0.0,Europe,Young


In [98]:
# Drop the Age column; Age_group replaces it
data = data.drop(columns=['Age'])

In [99]:
# Confirm Age is gone and Age_group remains
data.head()

Unnamed: 0,Gender,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,HPV_Related,Continent,Age_group
0,Male,1.0,1.0,High,0.0,Africa,Young
1,Female,1.0,1.0,High,0.0,Europe,Older
2,Female,1.0,1.0,Middle,1.0,Europe,Older
3,Male,1.0,1.0,Middle,0.0,Africa,Middle
4,Male,1.0,1.0,Middle,0.0,Europe,Young


**Socioeconomic_Status column Analysis**

In [100]:
# Socioeconomic_Status column analysis: number of rows per status level
data['Socioeconomic_Status'].value_counts()

Unnamed: 0_level_0,count
Socioeconomic_Status,Unnamed: 1_level_1
Low,26286
Middle,23266
High,15101


In [101]:
# Print the distinct values of each remaining column, one line per column
columns_to_inspect = (
    'Gender',
    'Continent',
    'Tobacco_Use',
    'Alcohol_Use',
    'HPV_Related',
    'Age_group',
    'Socioeconomic_Status',
)
for column_name in columns_to_inspect:
    print(data[column_name].unique())

['Male' 'Female']
['Africa' 'Europe' 'Asia' 'South America' 'North America']
[1. 0.]
[1. 0.]
[0. 1.]
['Young' 'Older' 'Middle']
['High' 'Middle' 'Low']


**Decode numerical columns into categorical labels**

In [102]:
# Relabel the 1/0 indicator columns: 1 -> 'Yes', 0 -> 'No'
yes_no_labels = {1.0: 'Yes', 0.0: 'No'}
for flag_column in ('Tobacco_Use', 'Alcohol_Use', 'HPV_Related'):
    data[flag_column] = data[flag_column].replace(yes_no_labels)

In [103]:
# Confirm the indicator columns now read Yes/No
data.head()

Unnamed: 0,Gender,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,HPV_Related,Continent,Age_group
0,Male,Yes,Yes,High,No,Africa,Young
1,Female,Yes,Yes,High,No,Europe,Older
2,Female,Yes,Yes,Middle,Yes,Europe,Older
3,Male,Yes,Yes,Middle,No,Africa,Middle
4,Male,Yes,Yes,Middle,No,Europe,Young


In [104]:
# One-hot encoding: each category of the listed columns becomes its own indicator column
columns_to_encode = [
    'Age_group',
    'Gender',
    'Continent',
    'Socioeconomic_Status',
    'Tobacco_Use',
    'Alcohol_Use',
    'HPV_Related',
]
data = pd.get_dummies(data, columns=columns_to_encode)

In [105]:
# Inspect the one-hot encoded columns
data.head()

Unnamed: 0,Age_group_Middle,Age_group_Older,Age_group_Young,Gender_Female,Gender_Male,Continent_Africa,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Socioeconomic_Status_High,Socioeconomic_Status_Low,Socioeconomic_Status_Middle,Tobacco_Use_No,Tobacco_Use_Yes,Alcohol_Use_No,Alcohol_Use_Yes,HPV_Related_No,HPV_Related_Yes
0,False,False,True,False,True,True,False,False,False,False,True,False,False,False,True,False,True,True,False
1,False,True,False,True,False,False,False,True,False,False,True,False,False,False,True,False,True,True,False
2,False,True,False,True,False,False,False,True,False,False,False,False,True,False,True,False,True,False,True
3,True,False,False,False,True,True,False,False,False,False,False,False,True,False,True,False,True,True,False
4,False,False,True,False,True,False,False,True,False,False,False,False,True,False,True,False,True,True,False


# **Save Preprocessed Dataset**

In [106]:
# Write the encoded dataset to the same Drive folder the raw CSV was read from.
# Drive was already mounted when the dataset was loaded, so it is not mounted
# again here, and no imports are repeated in the middle of the notebook.

# Define the correct directory and filename
folder_path = "/content/drive/MyDrive/DSGP/Siyumi/Modelnew/new"
file_name = "preprocessed_new_dataset.csv"
full_path = os.path.join(folder_path, file_name)

# Ensure the folder exists (no error if it is already there)
os.makedirs(folder_path, exist_ok=True)

# Save the dataset; index=False keeps the row index out of the CSV
data.to_csv(full_path, index=False)

print(f"Preprocessed dataset saved to: {full_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Preprocessed dataset saved to: /content/drive/MyDrive/DSGP/Siyumi/Modelnew/new/preprocessed_new_dataset.csv
