Genetic Dataset

Nount Google Drive in Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Example: Reading a CSV file from your Drive
path = "/content/drive/MyDrive/Spring2025/CBMF4761/CBMF4761Final/GSE96058-GPL11154_series_matrix.txt"

# Load the matrix (skip metadata lines that start with '!')
df = pd.read_csv(path , sep="\t", comment='!', index_col=0)

# Drop empty rows and columns
df.dropna(how="all", axis=0, inplace=True)
df.dropna(how="all", axis=1, inplace=True)

# Transpose to get samples as rows and genes as columns
df = df.T
df.head()


In [None]:
# Example approach to parse metadata (conceptual)
import pandas as pd
import re # Using regular expressions can be helpful

path = "/content/drive/MyDrive/CBMF4761Final/GSE2034_series_matrix.txt"
metadata = {}
current_sample_id = None

with open(path, 'r') as f:
    for line in f:
        if line.startswith("!Series_matrix_table_begin"):
            break # Stop when the actual data table begins
        if line.startswith("!Sample_geo_accession"):
            current_sample_id = line.strip().split('\t')[-1].strip('"')
            if current_sample_id not in metadata:
                metadata[current_sample_id] = {}
        elif line.startswith("!Sample_characteristics_ch1") and current_sample_id:
            # Example parsing: Assumes format like "!Sample_characteristics_ch1 = response: NR"
            # You might need to adjust parsing based on the exact format in the file
            parts = line.strip().split('=')
            if len(parts) > 1:
                char_line = parts[1].strip().strip('"')
                # Simple split on first colon - adjust if needed
                kv = char_line.split(':', 1)
                if len(kv) == 2:
                     key = kv[0].strip().lower().replace(" ", "_") # e.g. 'response', 'time_point'
                     value = kv[1].strip()
                     metadata[current_sample_id][key] = value

# Convert the dictionary to a DataFrame
metadata_df = pd.DataFrame.from_dict(metadata, orient='index')

print("Metadata extracted:")
print(metadata_df.head())
print("\nAvailable metadata columns:", metadata_df.columns)
# --- You might need to inspect metadata_df.columns and rename them ---
# --- e.g., metadata_df.rename(columns={'clinical_response': 'response'}, inplace=True) ---

Metadata extracted:
Empty DataFrame
Columns: []
Index: []

Available metadata columns: RangeIndex(start=0, stop=0, step=1)


In [None]:
# Print the first ~60 lines to inspect the header structure
print("--- File Header (First 60 lines) ---")
try:
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if i >= 60:
                break
            print(line.strip())
except FileNotFoundError:
    print(f"Error: File not found at {path}")
except Exception as e:
    print(f"Error reading file: {e}")
print("--- End of Header ---")

# --- Now, MANUALLY EXAMINE the output above ---
# Look for lines like:
# !Sample_geo_accession = "GSMxxxxx"
# !Sample_title = "..." (Sometimes contains info)
# !Sample_source_name_ch1 = "..."
# !Sample_organism_ch1 = "..."
# !Sample_characteristics_ch1 = "response: NR"  <-- Look carefully at THIS line's format
# !Sample_characteristics_ch1 = "time point: pre-treatment" <-- And this one
# Note the exact text before the colon (e.g., 'response', 'time point', 'clinical outcome')
# Note the separator (is it a colon ':', an equals sign '=', etc.?)

--- File Header (First 60 lines) ---
!Series_title	"Breast cancer relapse free survival"
!Series_geo_accession	"GSE2034"
!Series_status	"Public on Feb 23 2005"
!Series_submission_date	"Dec 03 2004"
!Series_last_update_date	"Aug 10 2018"
!Series_pubmed_id	"15721472"
!Series_summary	"This series represents 180 lymph-node negative relapse free patients and 106 lymph-node negate patients that developed a distant metastasis."
!Series_summary	"Please see attached patient clinical parameters sheet for more information."
!Series_summary	"Keywords: other"
!Series_type	"Expression profiling by array"
!Series_sample_id	"GSM36777 GSM36778 GSM36779 GSM36780 GSM36781 GSM36782 GSM36783 GSM36784 GSM36785 GSM36786 GSM36787 GSM36788 GSM36789 GSM36790 GSM36791 GSM36792 GSM36793 GSM36794 GSM36795 GSM36796 GSM36797 GSM36798 GSM36799 GSM36800 GSM36801 GSM36802 GSM36803 GSM36804 GSM36805 GSM36806 GSM36807 GSM36808 GSM36809 GSM36810 GSM36811 GSM36812 GSM36813 GSM36814 GSM36815 GSM36816 GSM36817 GSM36818 GSM36

In [None]:
print("Expression matrix shape (samples, genes):", df.shape)


Expression matrix shape (samples, genes): (286, 22283)


In [None]:
with open("/content/drive/MyDrive/Spring2025/CBMF4761/CBMF4761Final/GSE2034_series_matrix.txt", 'r') as f:
    # Read first 20 lines of the file to understand its structure
    lines = f.readlines()[:20]
    print(lines)

['!Series_title\t"Breast cancer relapse free survival"\n', '!Series_geo_accession\t"GSE2034"\n', '!Series_status\t"Public on Feb 23 2005"\n', '!Series_submission_date\t"Dec 03 2004"\n', '!Series_last_update_date\t"Aug 10 2018"\n', '!Series_pubmed_id\t"15721472"\n', '!Series_summary\t"This series represents 180 lymph-node negative relapse free patients and 106 lymph-node negate patients that developed a distant metastasis."\n', '!Series_summary\t"Please see attached patient clinical parameters sheet for more information."\n', '!Series_summary\t"Keywords: other"\n', '!Series_type\t"Expression profiling by array"\n', '!Series_sample_id\t"GSM36777 GSM36778 GSM36779 GSM36780 GSM36781 GSM36782 GSM36783 GSM36784 GSM36785 GSM36786 GSM36787 GSM36788 GSM36789 GSM36790 GSM36791 GSM36792 GSM36793 GSM36794 GSM36795 GSM36796 GSM36797 GSM36798 GSM36799 GSM36800 GSM36801 GSM36802 GSM36803 GSM36804 GSM36805 GSM36806 GSM36807 GSM36808 GSM36809 GSM36810 GSM36811 GSM36812 GSM36813 GSM36814 GSM36815 GSM368

In [None]:
# Read the metadata section to find the brain relapses info
with open("/content/drive/MyDrive/Spring2025/CBMF4761/CBMF4761Final/GSE2034_series_matrix.txt", 'r') as f:
    lines = f.readlines()

# Filter lines that contain "Brain relapses"
brain_lines = [line for line in lines if "brain relapses" in line.lower()]

# Extract labels for each sample (they might be in multiple lines for each sample)
labels = []
for line in brain_lines:
    # Split the line to get the relapse information
    label = line.strip().split(":")[-1].strip()  # Getting the 1/0 value after 'Brain relapses:'
    labels.append(int(label))  # Convert to int (1 or 0)

# Check the number of labels
print(f"Number of labels: {len(labels)}")

Number of labels: 0


In [None]:
# Check the number of samples in the expression matrix again
print(f"Number of samples in expression matrix: {df.shape[0]}")

Number of samples in expression matrix: 286


In [None]:
# Ensure labels match the number of samples
assert len(labels) == df.shape[0], "Number of labels doesn't match number of samples!"

# Add labels as a new column in the dataframe
df["label"] = labels

AssertionError: Number of labels doesn't match number of samples!

In [None]:
import pandas as pd

# Load the matrix, skipping metadata lines that start with "!"
df = pd.read_csv("GSE2034_series_matrix.txt", sep="\t", comment='!', index_col=0)

# Drop columns/rows with all NaN values (common in GEO files)
df.dropna(how="all", axis=0, inplace=True)
df.dropna(how="all", axis=1, inplace=True)

# Transpose to get samples as rows and genes as columns
df = df.T
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'GSE2034_series_matrix.txt'

In [None]:
pip install ucimlrepo # Import the BCW dataset

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
#Import the dataset into your code
from ucimlrepo import fetch_ucirepo

# fetch dataset
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

# data (as pandas dataframes)
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

# metadata
print(breast_cancer_wisconsin_original.metadata)

# variable information
print(breast_cancer_wisconsin_original.variables)

{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def load_data(file_path, label_column):
    df = pd.read_csv(file_path)
    X = df.drop(label_column, axis=1)
    y = df[label_column]
    return X, y

def preprocess(X):
    scaler = StandardScaler()
    return scaler.fit_transform(X)

def reduce_dimensions(X, method='tsne', n_components=2):
    if method == 'tsne':
        return TSNE(n_components=n_components, random_state=42).fit_transform(X)
    elif method == 'pca':
        return PCA(n_components=n_components).fit_transform(X)
    elif method == 'umap':
        return umap.UMAP(n_components=n_components, random_state=42).fit_transform(X)
    else:
        raise ValueError("Unsupported reduction method.")

def get_classifier(name='svm'):
    if name == 'svm':
        return SVC(kernel='rbf', C=1, gamma='scale')
    elif name == 'rf':
        return RandomForestClassifier(n_estimators=100, random_state=42)
    else:
        raise ValueError("Unsupported classifier.")

def evaluate_model(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", classification_report(y_true, y_pred))

def visualize(X_reduced, y):
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=X_reduced[:,0], y=X_reduced[:,1], hue=y, palette='coolwarm')
    plt.title("Dimensionality Reduction Visualization")
    plt.show()

def run_pipeline(data_path, label_column, reducer='tsne', classifier='svm'):
    X, y = load_data(data_path, label_column)
    X = preprocess(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_reduced = reduce_dimensions(X_train, method=reducer)
    visualize(X_reduced, y_train)

    clf = get_classifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    evaluate_model(y_test, y_pred)

# Example usage
run_pipeline('your_data.csv', label_column='Diagnosis', reducer='tsne', classifier='svm')


FileNotFoundError: [Errno 2] No such file or directory: 'your_data.csv'