# Import libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

# Import data

In [None]:
# Cancer types whose files are loaded in full
cancers = ['kich', 'acc', 'blca', 'brca', 'cesc', 'esca', 'kirk', 'laml', 'lgg', 'lich', 'ov', 'paad', 'prad', 'read', 'tgct', 'thca']
# Cancer types whose files are down-sampled to 50% of their rows
cancers_ = ['luad', 'ucec', 'coad', 'skcm']

# Collect one frame per cancer type and concatenate once at the end.
# Growing a DataFrame with pd.concat inside a loop re-copies every row that
# was already loaded, and concatenating onto an empty DataFrame emits a
# FutureWarning in recent pandas versions.
frames = []

# Import data
# 100%
for cancer in cancers:
    data = pd.read_csv(f'/content/{cancer}.csv')
    # Tag every record with its cancer type (upper-cased file stem)
    data['project_id'] = cancer.upper()
    frames.append(data)

# 50%
for cancer in cancers_:
    data = pd.read_csv(f'/content/{cancer}.csv')
    data['project_id'] = cancer.upper()
    rows_fifty_perc = int(0.5 * len(data))
    # Fixed seed keeps the sampled half reproducible between runs
    data_fifty_perc = data.sample(n=rows_fifty_perc, random_state=42)
    frames.append(data_fifty_perc)

# Pan Cancer dataframe: all per-cancer frames stacked with a fresh 0..n-1 index
pan_cancer_df = pd.concat(frames, ignore_index=True)

In [None]:
#pan_cancer_df.to_csv('/content/pan_cancer_data.csv')
#pan_cancer_df = pd.read_csv('/content/pan_cancer_data.csv')

## Shape of cancer data

In [None]:
# Shape of the combined frame as (rows, columns)
pan_cancer_df.shape

## Dataframe info

In [None]:
# Dataframe info: print the column names, non-null counts and dtypes
pan_cancer_df.info()

# Data Preprocessing I

## Rename columns

In [None]:
# Normalise every column label to lowercase
pan_cancer_df.columns = pan_cancer_df.columns.str.lower()

# Shorter names for a handful of columns (old label -> new label)
column_aliases = {
    '#"chrom"': 'chrom',
    'project_id': 'cancer_type',
    'variant_classification': 'variant',
    'matched_norm_sample_barcode': 'barcode',
}
pan_cancer_df.rename(columns=column_aliases, inplace=True)

## Reject records

In [None]:
# Count the distinct gender values before any filtering.
# Author's observation: records with multiple sample counts have multiple gender values.
pan_cancer_df['gender'].nunique()

In [None]:
# Get single sample count records only.
# .copy() makes the result an independent frame, so the in-place edits in the
# following cells (replace, drop, .loc assignment) do not operate on a slice
# of the unfiltered frame and do not raise SettingWithCopyWarning.
pan_cancer_df = pan_cancer_df[pan_cancer_df['samplecount'] == 1].copy()

# Check gender values that remain after filtering
pan_cancer_df['gender'].unique()

## Replacing '--' with nulls

In [None]:
# Show the first record: the raw data marks missing values with dashes '--'
pan_cancer_df.head(1)

In [None]:
# Replace the '--' placeholder with NaN.
# The result is reassigned instead of using inplace=True: the frame was
# produced by a boolean filter, and mutating such a slice in place triggers
# SettingWithCopyWarning.
pan_cancer_df = pan_cancer_df.replace('--', np.nan)
pan_cancer_df.head(1)

## Check missing values

In [None]:
# Check missing values: number of nulls in each column
pan_cancer_df.isnull().sum()

## Drop columns

In [None]:
# Columns dropped because of their null values
null_heavy_columns = [
    'dbsnp_rs', 'dbsnp_val_status', 'days_to_death', 'cigarettes_per_day',
    'weight', 'alcohol_history', 'alcohol_intensity', 'years_smoked',
    'height', 'ethnicity', 'bmi',
]

# Columns dropped because they are insignificant for the analysis
insignificant_columns = [
    'case_id', 'reserved', 'blockcount', 'score', 'strand', 'chromstarts',
    'samplecount', 'tumor_sample_barcode', 'entrez_gene_id',
]

# Remove both groups in a single pass
pan_cancer_df.drop(columns=null_heavy_columns + insignificant_columns, inplace=True)


## Handle missing values

In [None]:
# Relative frequency of each observed (non-null) gender value
gender_distribution = pan_cancer_df['gender'].value_counts(normalize=True)

# Rows whose gender is missing
null_mask = pan_cancer_df['gender'].isnull()

# Fill missing genders by sampling from the observed distribution.
# A seeded generator is used so the imputed values (and everything derived
# from them) are reproducible, consistent with the random_state=42 used
# elsewhere in this notebook.
if null_mask.any():
    rng = np.random.default_rng(42)
    random_genders = rng.choice(
        gender_distribution.index.to_numpy(),
        size=int(null_mask.sum()),
        p=gender_distribution.to_numpy(),
    )
    pan_cancer_df.loc[null_mask, 'gender'] = random_genders

# Check nulls after handling missing values
pan_cancer_df.isnull().sum()

In [None]:
#pan_cancer_df = pan_cancer_df[['chrom', 'chromstart', 'chromend', 'name', 'freq', 'variant', 'variant_type', 'tumor_seq_allele2', 'gender', 'cancer_type']]
#pan_cancer_df.to_csv('content/preprocessed_pan_cancer_data.csv')

# Data Analysis

## Total number of patients

In [None]:
# Total patients, counted as the number of distinct 'barcode' values
pan_cancer_df['barcode'].nunique()

## Total cancer types

In [None]:
# Total cancer types: number of distinct 'cancer_type' values
pan_cancer_df['cancer_type'].nunique()

## Data of each cancer type

In [None]:
# Records per cancer type, then shuffled into a fixed (seeded) order
records_per_type = pan_cancer_df['cancer_type'].value_counts()
cancer_counts = records_per_type.sample(frac=1, random_state=42)

In [None]:
# Horizontal bar chart: one bar per cancer type, bar length = record count
plt.figure(figsize=(12, 6))
sns.barplot(
    x=cancer_counts.values,
    y=cancer_counts.index,
    orient='h',
    palette='viridis',
)
plt.gca().set(
    xlabel='Number of records',
    ylabel='Cancer Types',
    title='Number of records of each cancer types',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()



```
UCEC - Uterine Corpus Endometrial Carcinoma
SKCM - Skin Cutaneous Melanoma
BLCA - Bladder Urothelial Carcinoma         
COAD - Colon Adenocarcinoma                 
BRCA - Breast Invasive Carcinoma            
LUAD - Lung Adenocarcinoma                  
CESC - Cervical and Endocervical Cancer     
OV   - Ovarian Serous Cystadenocarcinoma    
READ - Rectum Adenocarcinoma                
LICH - Liver Hepatocellular Carcinoma       
ESCA - Esophageal Carcinoma                 
LGG  - Brain Lower Grade Glioma             
PAAD - Pancreatic Adenocarcinoma            
PRAD - Prostate Adenocarcinoma              
KIRK - Kidney Renal Clear Cell Carcinoma    
ACC  - Adrenocortical Carcinoma             
THCA - Thyroid Carcinoma                    
LAML - Acute Myeloid Leukemia               
TGCT - Testicular Germ Cell Tumors          
KICH - Kidney Chromophobe

```

## Patients of each cancer type

In [None]:
# Distinct patient barcodes per cancer type, shuffled into a fixed (seeded) order
# (alternative ordering: .sort_values(ascending=False))
patients_per_type = pan_cancer_df.groupby('cancer_type')['barcode'].nunique()
cancer_patient_counts = patients_per_type.sample(frac=1, random_state=42)

In [None]:
# Vertical bar chart: one bar per cancer type, bar height = patient count
plt.figure(figsize=(12, 6))
patients_ax = sns.barplot(
    x=cancer_patient_counts.index,
    y=cancer_patient_counts.values,
    palette='husl',
)
patients_ax.set_xlabel('Cancer Types')
patients_ax.set_ylabel('Number of Patients')
patients_ax.set_title('Number of patients of each cancer types')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Total variants

In [None]:
# Records per variant classification, shuffled into a fixed (seeded) order
records_per_variant = pan_cancer_df['variant'].value_counts()
variant_counts = records_per_variant.sample(frac=1, random_state=42)

In [None]:
# Horizontal bar chart: one bar per variant, bar length = record count
plt.figure(figsize=(12, 6))
variant_bar_options = {'orient': 'h', 'palette': 'flare'}
sns.barplot(x=variant_counts.values, y=variant_counts.index, **variant_bar_options)
plt.gca().set(
    xlabel='Number of variants',
    ylabel='Variant Types',
    title='Number of records of each variant types',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Different chroms

In [None]:
# Records per chromosome, shuffled into a fixed (seeded) order
records_per_chrom = pan_cancer_df['chrom'].value_counts()
chrom_counts = records_per_chrom.sample(frac=1, random_state=42)

In [None]:
# Graph of number of records on each chromosome.
# chrom_counts holds value counts of the 'chrom' column, so the axes show
# chromosomes and record counts (the previous labels were copied from the
# patients-per-cancer-type plot and described the wrong quantities).
plt.figure(figsize=(12, 6))
sns.barplot(x=chrom_counts.index, y=chrom_counts.values, palette='crest')
plt.xlabel('Chromosomes')
plt.ylabel('Number of records')
plt.title('Number of records of each chromosome')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Variant type counts

In [None]:
# Variant type counts: number of records for each 'variant_type' value
variant_type_counts = pan_cancer_df['variant_type'].value_counts()

In [None]:
# Pie chart of variant types with percentage annotations
_, variant_pie_ax = plt.subplots(figsize=(5, 5))
variant_pie_ax.pie(
    variant_type_counts,
    labels=variant_type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)

# Title (optional)
variant_pie_ax.set_title('Pie Chart of Variant Counts')

# Render the figure
plt.show()

## Gender distribution

In [None]:
# Gender distribution: number of records for each 'gender' value
gender_counts = pan_cancer_df['gender'].value_counts()

In [None]:
# Pie chart of the gender split (labels only, no percentages)
_, gender_pie_ax = plt.subplots(figsize=(5,5))
gender_pie_ax.pie(gender_counts, labels=gender_counts.index)

# Title (optional)
gender_pie_ax.set_title('Pie Chart of Gender Counts')

# Render the figure
plt.show()

## Outliers analysis

In [None]:
## Calculate the IQR for the 'freq' column
#Q1 = pan_cancer_df['freq'].quantile(0.25)
#Q3 = pan_cancer_df['freq'].quantile(0.75)
#IQR = Q3 - Q1
#
## Define the lower and upper bounds
#lower_bound = Q1 - 1.5 * IQR
#upper_bound = Q3 + 1.5 * IQR
#
## Filter the DataFrame to remove outliers
#pan_cancer_df = pan_cancer_df[(pan_cancer_df['freq'] >= lower_bound) & (pan_cancer_df['freq'] <= upper_bound)]

In [None]:
## Calculate the IQR for the 'chromstart' column
#Q1 = pan_cancer_df['chromstart'].quantile(0.25)
#Q3 = pan_cancer_df['chromstart'].quantile(0.75)
#IQR = Q3 - Q1
#
## Define the lower and upper bounds
#lower_bound = Q1 - 1.5 * IQR
#upper_bound = Q3 + 1.5 * IQR
#
## Filter the DataFrame to remove outliers
#pan_cancer_df = pan_cancer_df[(pan_cancer_df['chromstart'] >= lower_bound) & (pan_cancer_df['chromstart'] <= upper_bound)]

In [None]:
## Create a boxplot for the 'freq' column
#plt.figure(figsize=(8, 6))
#sns.boxplot(data=pan_cancer_df, y='chromend')
#plt.title('Boxplot of freq Column')
#plt.show()

# Label encoding

In [None]:
# Label encoding
# One fitted encoder is kept per column so encoded values (in particular the
# 'cancer_type' target) can later be mapped back to their original strings
# with label_encoders[column].inverse_transform(...). Reusing a single
# encoder for every column would discard all mappings but the last one.
label_encoders = {}
le = LabelEncoder()

# Iterate through columns and apply label encoding to the object-typed ones
for column in pan_cancer_df.columns:
    if pan_cancer_df[column].dtype == 'object':
        le = LabelEncoder()
        pan_cancer_df[column] = le.fit_transform(pan_cancer_df[column])
        label_encoders[column] = le

# Correlation matrix

In [None]:
# Pairwise correlation matrix, then the column relating every feature to 'cancer_type'
correlation_matrix = pan_cancer_df.corr()
correlation_matrix['cancer_type']

In [None]:
# Heatmap of the pairwise feature correlations
feature_correlations = pan_cancer_df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(feature_correlations, linewidths=0.5)
plt.gca().set_title('Correlation Heatmap of Pan Cancer Features')
plt.show()

# Feature selection

In [None]:
# Feature selection: remove the patient barcode and keep every other column
# (explicit column selection kept for reference)
#pan_cancer_df = pan_cancer_df[['chrom', 'chromstart', 'chromend', 'name', 'freq', 'variant', 'variant_type', 'tumor_seq_allele2', 'gender', 'cancer_type']]
pan_cancer_df.drop(columns=['barcode'], inplace=True)
pan_cancer_df.head(5)

# Split data

## Test/Train split

In [None]:
# Per-class 80/20 split so every cancer type is represented in both sets.
# NOTE(review): rows are split individually, so records belonging to the same
# patient can land in both train and test — consider a patient-level split.
train_parts = []
test_parts = []

# Unique cancer types
cancers = pan_cancer_df['cancer_type'].unique()

# Split each cancer type's records into train and test
for cancer in cancers:
    cancer_data = pan_cancer_df[pan_cancer_df['cancer_type'] == cancer]

    if len(cancer_data) > 1:
        train_set, test_set = train_test_split(cancer_data, test_size=0.2, random_state=42)
        train_parts.append(train_set)
        test_parts.append(test_set)
    else:
        # A single record cannot be split: keep that record for training.
        # (Previously this branch re-appended the train_set left over from
        # the preceding iteration, duplicating another class's rows and
        # dropping this one — or raising NameError on the first iteration.)
        train_parts.append(cancer_data)

# Test/train dataframes, concatenated once instead of growing inside the loop
pan_cancer_train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
pan_cancer_test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

## Independent/Dependent variables in training and test sets

In [None]:
# Split data into independent (X) and dependent (y) variables.
# The target is selected by name rather than by position, so the split stays
# correct even if 'cancer_type' is not the last column of the frame.
target_column = 'cancer_type'

X_train = pan_cancer_train_df.drop(columns=target_column).values
y_train = pan_cancer_train_df[target_column].values

X_test = pan_cancer_test_df.drop(columns=target_column).values
y_test = pan_cancer_test_df[target_column].values

# Feature scaling

In [None]:
# Standardise features: statistics are learned from the training set only,
# then the same transformation is applied to both sets
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Train model

## Random Forest Classifier

In [None]:
# Random Forest: 100 trees, seeded
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf_classifier = RandomForestClassifier(**rf_settings)
rf_classifier.fit(X_train, y_train)

## Support Vector Machine Classifier

In [None]:
# SVM with a linear kernel and regularisation strength C=1.0
svm_settings = {'kernel': 'linear', 'C': 1.0}
svm_classifier = SVC(**svm_settings)
svm_classifier.fit(X_train, y_train)

## XGBoost Classifier

In [None]:
# XGBoost Classifier
# The class count is taken from the training labels instead of a hard-coded 20,
# so the cell keeps working if the set of cancer types changes.
n_classes = len(np.unique(y_train))

# 'multi:softprob' yields per-class probabilities, which the evaluation cells
# need via predict_proba; predict() still returns the most likely class label.
xgb_classifier = xgb.XGBClassifier(objective="multi:softprob", num_class=n_classes, n_estimators=100, random_state=42)
xgb_classifier.fit(X_train, y_train)

# Predict result

In [None]:
# Predicted labels on the test set, in the order: Random Forest, SVM, XGBoost
rf_pred, svm_pred, xgb_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (rf_classifier, svm_classifier, xgb_classifier)
)

# Evaluate model

## Accuracy

In [None]:
# Fraction of test records classified correctly by each model
rf_accuracy = accuracy_score(y_test, rf_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

# Report in the order: Random Forest, SVM, XGBoost
for model_name, model_accuracy in (
    ("Random Forest", rf_accuracy),
    ("SVM", svm_accuracy),
    ("XGBoost", xgb_accuracy),
):
    print(f"{model_name} Classifier Accuracy: {model_accuracy}")

## Classification report

In [None]:
# Per-class precision/recall/F1 text reports for the three models
rf_report, svm_report, xgb_report = (
    classification_report(y_test, predicted)
    for predicted in (rf_pred, svm_pred, xgb_pred)
)

# Random Forest Classifier
print("Random Forest Classifier Classification Report:\n")
print(rf_report)

# Support Vector Machine (SVM) Classifier
print("\nSVM Classifier Classification Report:\n")
print(svm_report)

# XGBoost Classifier
print("\nXGBoost Classifier Classification Report:\n")
print(xgb_report)

## F1 score

In [None]:
def _weighted_f1(predicted):
    """Return the support-weighted F1 score of *predicted* against y_test."""
    return f1_score(y_test, predicted, average='weighted')


# Weighted F1 score of each classifier
rf_f1 = _weighted_f1(rf_pred)
svm_f1 = _weighted_f1(svm_pred)
xgb_f1 = _weighted_f1(xgb_pred)

# Report with four decimals
for model_name, model_f1 in (
    ("Random Forest", rf_f1),
    ("SVM", svm_f1),
    ("XGBoost", xgb_f1),
):
    print(f"{model_name} Classifier F1 Score: {model_f1:.4f}")

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Use one shared, sorted label list so all three matrices have the same
# row/column order and the tick labels match the matrix size.
class_labels = np.unique(np.concatenate([y_test, rf_pred, svm_pred, xgb_pred]))

# Calculate the confusion matrix for each classifier
rf_confusion_matrix = confusion_matrix(y_test, rf_pred, labels=class_labels)
svm_confusion_matrix = confusion_matrix(y_test, svm_pred, labels=class_labels)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_pred, labels=class_labels)

# Plot every model's matrix side by side. The tick labels are the encoded
# class labels: the task is multi-class, so binary "Negative/Positive" tick
# labels do not fit the matrix.
sns.set(font_scale=0.8)  # Adjust the font size for readability
fig, axes = plt.subplots(1, 3, figsize=(24, 7))
matrices = [
    ("Random Forest", rf_confusion_matrix),
    ("SVM", svm_confusion_matrix),
    ("XGBoost", xgb_confusion_matrix),
]
for ax, (model_name, matrix) in zip(axes, matrices):
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"{model_name} Confusion Matrix")

plt.tight_layout()
plt.show()

## Precision-Recall curve

In [None]:
# Binarize the labels for multi-class (one-vs-rest) evaluation.
# Classes come from the fitted model instead of a hard-coded 0..19 list, so
# column i of every score matrix lines up with column i of y_test_bin.
# Assumes more than two classes (label_binarize yields one column for binary).
class_labels = rf_classifier.classes_
y_test_bin = label_binarize(y_test, classes=class_labels)

# Score the test set once per model; these calls do not depend on the class
# being plotted, so they are kept out of the loop.
rf_scores = rf_classifier.predict_proba(X_test)
svm_decision_function = svm_classifier.decision_function(X_test)
xgb_scores = xgb_classifier.predict_proba(X_test)

# Calculate and plot Precision-Recall curves for each class
plt.figure(figsize=(10, 6))
for class_index in range(len(class_labels)):
    truth = y_test_bin[:, class_index]
    rf_precision, rf_recall, _ = precision_recall_curve(truth, rf_scores[:, class_index])
    svm_precision, svm_recall, _ = precision_recall_curve(truth, svm_decision_function[:, class_index])
    xgb_precision, xgb_recall, _ = precision_recall_curve(truth, xgb_scores[:, class_index])

    # Curves are coloured per model; only the first curve of each model is
    # labelled so the legend has three entries instead of one per curve.
    first_class = class_index == 0
    plt.plot(rf_recall, rf_precision, label="Random Forest" if first_class else None, color='b')
    plt.plot(svm_recall, svm_precision, label="SVM" if first_class else None, color='g')
    plt.plot(xgb_recall, xgb_precision, label="XGBoost" if first_class else None, color='r')

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve (Multi-class)")
plt.legend(loc='best')
plt.grid()
plt.show()

## AUC/ROC curve

In [None]:
# roc_curve only accepts binary targets, so the multi-class problem is turned
# into one-vs-rest indicator columns and micro-averaged: every (sample, class)
# pair is treated as one binary decision.
# Assumes more than two classes (label_binarize yields one column for binary).
class_labels = rf_classifier.classes_
y_test_bin = label_binarize(y_test, classes=class_labels)
y_true_flat = y_test_bin.ravel()

# Calculate ROC curve and AUC for Random Forest Classifier
rf_probs = rf_classifier.predict_proba(X_test)
rf_fpr, rf_tpr, _ = roc_curve(y_true_flat, rf_probs.ravel())
rf_auc = roc_auc_score(y_true_flat, rf_probs.ravel())

# Calculate ROC curve and AUC for SVM Classifier (decision scores, one column per class)
svm_probs = svm_classifier.decision_function(X_test)
svm_fpr, svm_tpr, _ = roc_curve(y_true_flat, svm_probs.ravel())
svm_auc = roc_auc_score(y_true_flat, svm_probs.ravel())

# Calculate ROC curve and AUC for XGBoost Classifier
xgb_probs = xgb_classifier.predict_proba(X_test)
xgb_fpr, xgb_tpr, _ = roc_curve(y_true_flat, xgb_probs.ravel())
xgb_auc = roc_auc_score(y_true_flat, xgb_probs.ravel())

# Plot ROC curves
plt.figure(figsize=(10, 6))
plt.plot(rf_fpr, rf_tpr, label=f"Random Forest (AUC = {rf_auc:.2f})", color='b')
plt.plot(svm_fpr, svm_tpr, label=f"SVM (AUC = {svm_auc:.2f})", color='g')
plt.plot(xgb_fpr, xgb_tpr, label=f"XGBoost (AUC = {xgb_auc:.2f})", color='r')
# Chance diagonal; the colour is given once (a 'k--' format string plus
# color='gray' specifies the colour twice and triggers a matplotlib warning)
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label="Random")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (micro-averaged, one-vs-rest)")
plt.legend(loc='best')
plt.grid()
plt.show()

# Conclusion

# References