In [4]:
# =========================================================
# Assignment 2 – Yeast Transcript Half-Life Analysis
# Fully Updated & Colab-Compatible Version
# =========================================================

import pandas as pd
import numpy as np
from scipy.stats import linregress
import matplotlib.pyplot as plt
from google.colab import files

# ---------------------------------------------------------
# STEP 1: Upload File (Safer for Colab)
# ---------------------------------------------------------
print("Please upload DecayTimecourse.txt")
uploaded = files.upload()

# The first key of the upload mapping is used as the path to read.
filename = list(uploaded)[0]
data = pd.read_csv(filename, sep="\t")

# ---------------------------------------------------------
# STEP 2: Clean Data
# ---------------------------------------------------------
# Drop fully empty rows, label the first column "Transcript" and make it the
# index, coerce every remaining cell to a number (unparseable cells become
# NaN), then drop rows that are left without any numeric value.
data = data.dropna(how="all")
data = data.rename(columns={data.columns[0]: "Transcript"})
data = data.set_index("Transcript")
data = data.apply(pd.to_numeric, errors="coerce").dropna(how="all")

print(f"Total transcripts after cleaning: {len(data)}")

# ---------------------------------------------------------
# STEP 3: Define Time Points
# ---------------------------------------------------------
time_points = np.array([0, 5, 10, 15, 20, 30, 40, 50, 60])

# Three consecutive groups of nine columns, one group per replicate
# (one column per entry of time_points).
replicates = [
    data.iloc[:, first:first + 9].values
    for first in (0, 9, 18)
]

transcripts = data.index

# ---------------------------------------------------------
# STEP 4: Half-Life Estimation Function
# ---------------------------------------------------------
def compute_half_life(values, times=None):
    """Estimate a half-life from one expression time course.

    Fits a straight line to ln(expression) versus time by least squares and
    converts the fitted slope into a half-life, ``ln(2) / -slope``.

    Parameters
    ----------
    values : array-like of float
        Expression measurements, one per sampling time. NaN, infinite and
        non-positive entries are treated as missing, since their logarithm
        is undefined or unusable in the fit.
    times : array-like of float, optional
        Sampling times matching ``values`` element for element. When
        omitted, the module-level ``time_points`` array is used.

    Returns
    -------
    float
        The half-life in the units of ``times``, or ``np.nan`` when fewer
        than three usable points remain or the fitted slope is not negative
        (no decay detected).
    """
    if times is None:
        times = time_points

    t = np.asarray(times, dtype=float)
    expr = np.asarray(values, dtype=float)

    # Build the "usable" mask in two steps so that NaN never takes part in
    # the ``> 0`` comparison.
    finite = np.isfinite(expr)
    usable = np.zeros_like(finite)
    usable[finite] = expr[finite] > 0

    # A line through fewer than three points says nothing about the decay.
    if usable.sum() < 3:
        return np.nan

    slope = linregress(t[usable], np.log(expr[usable])).slope

    # Flat or rising signal (or an undefined slope) means no measurable decay.
    if not slope < 0:
        return np.nan

    return np.log(2) / -slope

# ---------------------------------------------------------
# STEP 5: Calculate Half-Lives
# ---------------------------------------------------------
results = []

# Walk each transcript together with its row from every replicate.
for transcript, *replicate_rows in zip(transcripts, *replicates):
    estimates = (compute_half_life(row) for row in replicate_rows)
    replicate_half_lives = [hl for hl in estimates if not np.isnan(hl)]

    # Transcripts with no usable replicate estimate are left out entirely;
    # otherwise the valid replicate estimates are averaged.
    if replicate_half_lives:
        results.append([transcript, np.mean(replicate_half_lives)])

half_life_df = pd.DataFrame(
    results, columns=["Transcript", "Average_HalfLife"]
).set_index("Transcript")

print(f"Genes with valid half-life: {len(half_life_df)}")

# ---------------------------------------------------------
# STEP 6: Identify Top & Bottom 10%
# ---------------------------------------------------------
# Cutoffs are the 90th and 10th percentiles of the averaged half-lives;
# rows exactly at a cutoff are included in the corresponding group.
half_lives = half_life_df["Average_HalfLife"]
top_cutoff = half_lives.quantile(0.9)
bottom_cutoff = half_lives.quantile(0.1)

high_stability = half_life_df.loc[half_lives >= top_cutoff]
low_stability = half_life_df.loc[half_lives <= bottom_cutoff]

print(f"Top 10% genes: {len(high_stability)}")
print(f"Bottom 10% genes: {len(low_stability)}")

# ---------------------------------------------------------
# STEP 7: Save Output Files
# ---------------------------------------------------------
half_life_df.to_csv("All_HalfLives.csv")

# Each group is written twice: the full table as CSV, and a bare list of
# transcript names (no index column, no header) as plain text.
group_outputs = (
    (
        high_stability,
        "high_half_life_genes_detailed_results.csv",
        "high_half_life_genes.txt",
    ),
    (
        low_stability,
        "low_half_life_genes_detailed_results.csv",
        "low_half_life_genes.txt",
    ),
)
for group, table_path, names_path in group_outputs:
    group.to_csv(table_path)
    group.index.to_series().to_csv(names_path, index=False, header=False)

summary_stats = half_life_df["Average_HalfLife"].describe()
summary_stats.to_csv("half_life_overview_statistics.csv")

# ---------------------------------------------------------
# STEP 8: Save Distribution Plot
# ---------------------------------------------------------
# Draw the histogram on an explicit figure/axes pair, write it to disk and
# close the figure afterwards.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(half_life_df["Average_HalfLife"], bins=50)
ax.set_xlabel("Half-Life (minutes)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Yeast Transcript Half-Lives")
fig.tight_layout()
fig.savefig("half_life_distribution_overview.png")
plt.close(fig)

print("All output files saved successfully.")

# ---------------------------------------------------------
# STEP 9: Display Summary Statistics
# ---------------------------------------------------------
print("\nHalf-Life Summary:")
print(summary_stats)

Please upload DecayTimecourse.txt


Saving DecayTimecourse.txt to DecayTimecourse.txt
Total transcripts after cleaning: 6176
Genes with valid half-life: 6133
Top 10% genes: 614
Bottom 10% genes: 614
All output files saved successfully.

Half-Life Summary:
count     6133.000000
mean        59.459527
std        302.810838
min          3.466163
25%         21.966658
50%         30.648544
75%         46.542721
max      15670.996989
Name: Average_HalfLife, dtype: float64
