In [52]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 10)
pd.set_option("display.width", 120)

In [53]:
labels_df = pd.read_csv("labels_real.csv")

labels_df["Sample"] = labels_df["Sample"].astype(str).str.strip()
labels_df = labels_df.set_index("Sample")

print("Labels shape:", labels_df.shape)
print(labels_df["Label"].value_counts())
labels_df.head()

Labels shape: (165, 1)
Label
1    147
0     18
Name: count, dtype: int64


Unnamed: 0_level_0,Label
Sample,Unnamed: 1_level_1
"""GSM152839""",0
"""GSM152840""",0
"""GSM152841""",0
"""GSM152842""",0
"""GSM152843""",0


In [55]:
expr_data = []
gene_ids = []
samples = []

with open("GSE6919-GPL93_series_matrix.txt", "r", encoding="utf-8", errors="ignore") as f:
    # 1️⃣ Extract sample IDs
    for line in f:
        if line.startswith("!Sample_geo_accession"):
            samples = line.strip().split("\t")[1:]
        elif line.startswith("!series_matrix_table_begin"):
            break

    # 2️⃣ Skip header row: ID_REF GSMxxxx GSMxxxx ...
    header = next(f)

    # 3️⃣ Read actual numeric data
    for line in f:
        if line.startswith("!series_matrix_table_end"):
            break
        parts = line.rstrip().split("\t")
        gene_ids.append(parts[0])
        expr_data.append(parts[1:])

expr_raw = pd.DataFrame(expr_data, index=gene_ids, columns=samples)

# 4️⃣ Safe numeric conversion
expr_raw = expr_raw.apply(pd.to_numeric, errors="coerce")

print("✅ Raw expression shape (probes x samples):", expr_raw.shape)
expr_raw.iloc[:5, :5]

✅ Raw expression shape (probes x samples): (12646, 165)


Unnamed: 0,"""GSM152839""","""GSM152840""","""GSM152841""","""GSM152842""","""GSM152843"""
"""48609_r_at""",3.8,5.6,7.7,75.9,47.8
"""48610_at""",3132.2,1015.7,1446.0,1889.9,1958.1
"""48612_at""",2589.3,3452.2,2657.7,2098.6,2007.6
"""48613_at""",39.2,312.6,234.5,190.3,55.8
"""48615_at""",593.3,481.7,777.5,174.4,390.9


In [56]:
common_samples = expr_raw.columns.intersection(labels_df.index)

print("Common samples:", len(common_samples))

expr_raw = expr_raw[common_samples]
labels_df = labels_df.loc[common_samples]

print("Aligned expression shape:", expr_raw.shape)
print("Aligned labels shape:", labels_df.shape)

Common samples: 165
Aligned expression shape: (12646, 165)
Aligned labels shape: (165, 1)


In [62]:
import pandas as pd
from io import StringIO

# ---- STEP 1: Read file as raw text ----
with open("GPL93.annot.txt", "r", encoding="utf-8", errors="ignore") as f:
    lines = f.readlines()

# ---- STEP 2: Find where the platform table begins ----
start_idx = None
for i, line in enumerate(lines):
    if line.startswith("!platform_table_begin"):
        start_idx = i + 1
        break

assert start_idx is not None, "platform_table_begin not found!"

# ---- STEP 3: Load only the real table ----
annot_df = pd.read_csv(
    StringIO("".join(lines[start_idx:])),
    sep="\t"
)

print("Annotation table shape:", annot_df.shape)
print("Annotation columns:")
print(annot_df.columns.tolist())

Annotation table shape: (12647, 21)
Annotation columns:
['ID', 'Gene title', 'Gene symbol', 'Gene ID', 'UniGene title', 'UniGene symbol', 'UniGene ID', 'Nucleotide Title', 'GI', 'GenBank Accession', 'Platform_CLONEID', 'Platform_ORF', 'Platform_SPOTID', 'Chromosome location', 'Chromosome annotation', 'GO:Function', 'GO:Process', 'GO:Component', 'GO:Function ID', 'GO:Process ID', 'GO:Component ID']


In [72]:
import pandas as pd

expr_data = []
gene_ids = []

with open("GSE6919-GPL93_series_matrix.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        if line.startswith("!series_matrix_table_begin"):
            break

    for line in f:
        if line.startswith("!series_matrix_table_end"):
            break

        parts = line.strip().split("\t")
        gene_ids.append(parts[0])
        expr_data.append(parts[1:])

expr_raw = pd.DataFrame(expr_data, index=gene_ids)

In [73]:
with open("GSE6919-GPL93_series_matrix.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        if line.startswith("!Sample_geo_accession"):
            samples = line.strip().split("\t")[1:]
            break

expr_raw.columns = samples

In [74]:
expr_raw = expr_raw.apply(pd.to_numeric, errors="coerce")

In [75]:
print("expr_raw shape:", expr_raw.shape)
print("Example probes:", expr_raw.index[:5].tolist())
print("Example samples:", expr_raw.columns[:5].tolist())

expr_raw shape: (12647, 165)
Example probes: ['"ID_REF"', '"48609_r_at"', '"48610_at"', '"48612_at"', '"48613_at"']
Example samples: ['"GSM152839"', '"GSM152840"', '"GSM152841"', '"GSM152842"', '"GSM152843"']


In [77]:
# Remove ID_REF row if present
expr_raw = expr_raw[expr_raw.index != "ID_REF"]

print("After removing ID_REF:")
print("expr_raw shape:", expr_raw.shape)
print("Example probes:", expr_raw.index[:5].tolist())

After removing ID_REF:
expr_raw shape: (12647, 166)
Example probes: ['"ID_REF"', '"48609_r_at"', '"48610_at"', '"48612_at"', '"48613_at"']


In [78]:
# Rename annotation columns once
annot_df = annot_df.rename(columns={"ID": "Probe", "Gene symbol": "Gene"})

# Clean
expr_raw.index = expr_raw.index.astype(str).str.strip()
annot_df["Probe"] = annot_df["Probe"].astype(str).str.strip()
annot_df["Gene"] = annot_df["Gene"].astype(str).str.strip()

# Map probes → genes
probe_to_gene = dict(zip(annot_df["Probe"], annot_df["Gene"]))
expr_raw["Gene"] = expr_raw.index.map(probe_to_gene)

print("Mapped genes:", expr_raw["Gene"].notna().sum())
print("Total probes:", expr_raw.shape[0])

Mapped genes: 0
Total probes: 12647


In [79]:
# 1. Move index to a column so we can fix it cleanly
expr_raw = expr_raw.reset_index()

print(expr_raw.head())

          index  "GSM152839"  "GSM152840"  "GSM152841"  "GSM152842"  ...  "GSM187532"  "GSM187533"  "GSM187534"  \
0      "ID_REF"          NaN          NaN          NaN          NaN  ...          NaN          NaN          NaN   
1  "48609_r_at"          3.8          5.6          7.7         75.9  ...         13.7          6.1         24.1   
2    "48610_at"       3132.2       1015.7       1446.0       1889.9  ...       2040.0       2152.9       1678.2   
3    "48612_at"       2589.3       3452.2       2657.7       2098.6  ...       3533.8       3063.6       3163.0   
4    "48613_at"         39.2        312.6        234.5        190.3  ...         16.0        105.8         10.2   

   "GSM187535"  Gene  
0          NaN   NaN  
1          6.1   NaN  
2       2762.5   NaN  
3       3139.7   NaN  
4        190.1   NaN  

[5 rows x 167 columns]


In [85]:
# Clean probe IDs in index
expr_raw.index = (
    expr_raw.index
    .astype(str)
    .str.replace('"', '', regex=False)
    .str.strip()
)

# Remove ID_REF row
expr_raw = expr_raw[expr_raw.index != "ID_REF"]

print("expr_raw shape:", expr_raw.shape)
print("Example probes:", expr_raw.index[:5].tolist())

expr_raw shape: (12646, 166)
Example probes: ['48609_r_at', '48610_at', '48612_at', '48613_at', '48615_at']


In [86]:
# Remove old/broken Gene column if it exists
if "Gene" in expr_raw.columns:
    expr_raw = expr_raw.drop(columns=["Gene"])

print("Columns after cleanup:", expr_raw.columns[-5:])

Columns after cleanup: Index(['"GSM153295"', '"GSM187532"', '"GSM187533"', '"GSM187534"', '"GSM187535"'], dtype='object')


In [87]:
# Rename annotation columns ONCE
annot_df = annot_df.rename(columns={
    "ID": "Probe",
    "Gene symbol": "Gene"
})

# Clean annotation fields
annot_df["Probe"] = (
    annot_df["Probe"]
    .astype(str)
    .str.replace('"', '', regex=False)
    .str.strip()
)

annot_df["Gene"] = (
    annot_df["Gene"]
    .astype(str)
    .str.strip()
)

In [88]:
# Build mapping dictionary
probe_to_gene = dict(zip(annot_df["Probe"], annot_df["Gene"]))

# Map probes to genes
expr_raw["Gene"] = expr_raw.index.map(probe_to_gene)

print("Mapped genes:", expr_raw["Gene"].notna().sum())
print("Total probes:", expr_raw.shape[0])

Mapped genes: 7540
Total probes: 12646


In [89]:
expr_gene = (
    expr_raw
    .dropna(subset=["Gene"])
    .drop(columns=["Gene"])
    .groupby(expr_raw["Gene"])
    .mean()
)

print("Gene-level expression shape:", expr_gene.shape)
expr_gene.head()

Gene-level expression shape: (5642, 165)


Unnamed: 0_level_0,"""GSM152839""","""GSM152840""","""GSM152841""","""GSM152842""","""GSM152843""",...,"""GSM153295""","""GSM187532""","""GSM187533""","""GSM187534""","""GSM187535"""
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A1CF,13.35,72.45,11.15,114.1,17.2,...,32.95,139.3,32.95,29.8,3.65
AAAS,386.7,429.2,506.1,572.4,453.7,...,790.2,92.6,609.3,558.0,465.7
AACS,485.333333,326.2,341.0,279.666667,425.8,...,397.433333,421.666667,427.333333,478.766667,723.533333
AADAT,64.5,195.65,107.8,101.7,110.65,...,143.0,26.7,17.75,62.05,102.75
AAK1,29.7,47.8,9.3,46.1,19.5,...,80.2,10.7,14.7,4.3,4.7


In [90]:
# Save final gene-level matrix
expr_gene.to_csv("gene_expression_matrix.csv")

# Save labels aligned to samples
labels_df.to_csv("labels_final.csv")

print("Saved:")
print("- gene_expression_matrix.csv")
print("- labels_final.csv")

Saved:
- gene_expression_matrix.csv
- labels_final.csv
