In [22]:
import pandas as pd
from scipy.stats import pearsonr
import plotly.graph_objects as go

In [20]:
# Read the "details" table and keep only the dataset, gene and length columns.
lengths = pd.read_hdf(
    "/home/wdecoster/wsl-repos/pathSTR-1000G/1000G.pathSTRdb", key="details"
)[["dataset", "gene", "length"]]

# Collapse the multi-part column labels into single strings; this yields the
# "dataset_", "gene_", "length_Allele1" and "length_Allele2" names used below.
lengths.columns = ["_".join(parts).strip() for parts in lengths.columns.values]

# Move the index (which provides "sample") into regular columns and build a
# per-sample, per-gene key to pivot on.
lengths = lengths.reset_index().assign(
    identifier=lambda table: table["sample"] + "_" + table["gene_"]
)

# One row per identifier, one column per (allele, dataset) combination.
# NOTE(review): fillna(0) turns missing lengths into 0, and those zeros then take
# part in the min/max below and in any statistics computed from `df` -- confirm
# that this is intended.
wide = lengths.pivot(
    index="identifier",
    columns="dataset_",
    values=["length_Allele1", "length_Allele2"],
).fillna(0)

# Flatten the pivoted labels, e.g. "length_Allele1_<dataset>" -> "Allele1_<dataset>",
# dropping any "_hg38" part of the dataset name.
wide.columns = [
    "_".join(parts).strip().replace("_hg38", "").replace("length_", "")
    for parts in wide.columns.values
]

# For each caller, the larger of the two allele lengths is the "long" allele and
# the smaller one the "short" allele.
for caller in ("LongTR", "STRdust"):
    allele_pair = wide[[f"Allele1_{caller}", f"Allele2_{caller}"]]
    wide[f"{caller}_long_allele"] = allele_pair.max(axis=1)
    wide[f"{caller}_short_allele"] = allele_pair.min(axis=1)

df = wide

In [27]:
# Pair up the LongTR and STRdust columns that are compared against each other.
long_pair = (df["LongTR_long_allele"], df["STRdust_long_allele"])
short_pair = (df["LongTR_short_allele"], df["STRdust_short_allele"])
# "both" compares the per-row sum of the long and short allele of each caller.
both_pair = (long_pair[0] + short_pair[0], long_pair[1] + short_pair[1])

# Pearson correlation statistic and p-value for each comparison.
stat_long, p_long = pearsonr(*long_pair)
stat_short, p_short = pearsonr(*short_pair)
stat_both, p_both = pearsonr(*both_pair)

In [28]:
# Print statistic and p-value, one line per comparison: long, short, combined.
for r_value, p_value in (
    (stat_long, p_long),
    (stat_short, p_short),
    (stat_both, p_both),
):
    print(r_value, p_value)

0.8193898600472165 0.0
0.5906233130787318 0.0
0.7966870309189044 0.0


In [38]:
# Scatter plot of LongTR (x) against STRdust (y) allele lengths, with one trace
# for the long alleles and one for the short alleles.
fig = go.Figure()
for allele, colour, label in (
    ("long", "blue", "Long allele"),
    ("short", "red", "Short allele"),
):
    fig.add_trace(
        go.Scatter(
            x=df[f"LongTR_{allele}_allele"],
            y=df[f"STRdust_{allele}_allele"],
            marker_color=colour,
            mode="markers",
            name=label,
        )
    )
fig.update_traces(marker_size=2)

# Square figure with transparent backgrounds and the legend in the top-left corner.
fig.update_layout(
    title="LongTR vs. STRdust",
    xaxis_title="LongTR",
    yaxis_title="STRdust",
    height=800,
    width=800,
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
    font={"size": 16},
    legend={"yanchor": "top", "y": 0.95, "xanchor": "left", "x": 0.01},
)

# Both axes get the same black mirrored frame, no grid, and the same fixed
# 0-2000 range.
axis_style = {
    "showline": True,
    "linewidth": 2,
    "linecolor": "black",
    "mirror": True,
    "showgrid": False,
    "range": [0, 2000],
}
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Summarise the correlation coefficients in a text box placed in paper coordinates.
correlation_summary = "<br>".join(
    [
        "<b>Pearson correlation coefficient</b>",
        f"- long allele: {stat_long:.2f}",
        f"- short allele: {stat_short:.2f}",
        f"- combined: {stat_both:.2f}",
    ]
)
fig.add_annotation(
    x=0.5,
    y=0.9,
    xref="paper",
    yref="paper",
    text=correlation_summary,
    showarrow=False,
    align="left",
)
fig.show()