# Co-Expression Analysis

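This notebook loads the per-topic ranking table (`rank_by_topic`), reshapes it with Spark into one row per topic, and passes the result to PyWGCNA for a co-expression network analysis.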

## Import libraries

In [None]:
import PyWGCNA

import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession

## Setup spark session

In [None]:
# Local Spark session for this notebook: master "local[*]", a 20g driver, and
# the Arrow setting enabled (used later when converting to pandas).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("single-cell-reciter-job")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.memory", "20g")
    .getOrCreate()
)

In [None]:
# Display the session object as the cell output.
spark

## Load the data

In [None]:
# Read the per-topic ranking table from Parquet; the path is relative to the
# notebook's working directory.
data = spark.read.parquet('./Results/parquets/rank_by_topic')

In [None]:
# Print the first rows as a quick sanity check of the loaded table.
data.show()

In [None]:
# Number of rows in the loaded table.
data.count()

In [None]:
# Topic labels are collected here by the loop in the next cell.
topic_list = []

# Pull the entries of the 'topic_0' column to the driver; they become the
# column names of the wide table.
col_names = [row[0] for row in data.select('topic_0').toLocalIterator()]

# One non-nullable double column per collected name.
schema = T.StructType(
    [
        T.StructField(field_name, T.DoubleType(), nullable=False)
        for field_name in col_names
    ]
)

# Empty DataFrame with that schema; rows are appended to it later.
df_base = spark.createDataFrame([], schema)

In [None]:
# Append one wide row per topic to df_base: the entries of the topic's first
# matching column become the column names, the entries of its second matching
# column become the row's values.
for i in range(10):
    topic_list.append(f'topic_{i}')

    # NOTE(review): this is a substring match on the digit, so it assumes that
    # exactly the two columns belonging to topic i contain it — confirm
    # against the schema of `data`.
    col_list = [col for col in data.columns if f'{i}' in col]
    name_col, value_col = col_list[0], col_list[1]

    # Read both columns in a single pass so every value stays paired with its
    # name. Two independent scans are not guaranteed to return the rows in the
    # same order, and they also read the data twice.
    pairs = [
        (row[0], row[1])
        for row in data.select(name_col, value_col).toLocalIterator()
    ]
    names = [name for name, _ in pairs]
    values = [value for _, value in pairs]

    # unionByName aligns the new single-row frame with df_base by column name.
    df_base = df_base.unionByName(spark.createDataFrame([values], names))

In [None]:
# Persist the assembled table as Parquet.
# NOTE(review): no save mode is set, so re-running this cell presumably fails
# once the path exists — confirm and consider an explicit mode.
# NOTE(review): the input was read from './Results/...' while this writes to
# '../results/...' (different base directory and casing) — confirm intended.
df_base.write.parquet("../results/base.parquet")

In [None]:
# Row count of the assembled table.
df_base.count()

In [None]:
# Column count of the assembled table.
len(df_base.columns)

## Analysis

### Load the data

In [None]:
# Reload the table written earlier. This rebinds `data`, which until now held
# the rank_by_topic DataFrame.
data = spark.read.parquet("../results/base.parquet")

In [None]:
# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
pd_df= data.toPandas()

In [None]:
# Keep the matrix in its original orientation (one row per topic, one column
# per collected name) instead of transposing it.
# PyWGCNA reads the rows of `geneExp` as samples and every column after the
# first as a gene, so `pd_df.T` turned each column name into a "sample". That
# is consistent with the failures recorded under "Pre-processing": a sample
# dendrogram 138900 pixels wide, and "Too few genes with valid expression
# levels".
# NOTE(review): confirm the expected layout against the PyWGCNA version in use.
# The variable keeps its historical name because the following cells use it.
transposed_data = pd_df

In [None]:
# Move the index into a regular first column (pandas names it 'index').
# Presumably done because PyWGCNA expects the identifiers in the first column
# of the table — TODO confirm.
transposed_data = transposed_data.reset_index()

### Create a PyWGCNA object

In [None]:
# Optional: builds a standalone GeneExp object from the same DataFrame.
# `expression_matrix` is not used by the remaining cells of this section.
expression_matrix = PyWGCNA.geneExp.GeneExp(species='human', level='transcript', geneExp=transposed_data)

In [None]:
# Create the PyWGCNA object from the prepared DataFrame.
# NOTE(review): the notes under "Pre-processing" record failures with both
# TPMcutoff=1 and TPMcutoff=0, and with save=False — revisit these settings.
scc = PyWGCNA.WGCNA(name='SCC', species='human', geneExp=transposed_data, save=True, TPMcutoff=1)

In [None]:
# Peek at the first five rows of the expression table held by the object.
scc.geneExpr.to_df().head(5)

### Pre-processing

In [None]:
# Run PyWGCNA's pre-processing step; the outcomes observed so far are recorded
# in the notes that follow this cell.
scc.preprocess()

With TPMcutoff = 1 (default):

Output:
Pre-processing...
	Detecting genes and samples with too many missing values...
An exception has occurred, use %tb to see the full traceback.

SystemExit: Too few genes with valid expression levels in the required number of samples.

With TPMcutoff = 0 : 

ValueError: Image size of 138900x1000 pixels is too large. It must be less than 2^16 in each direction.

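--> Tried setting `save=False`.

--> Same error. The 138900-pixel figure width suggests the plot is sized by the number of rows treated as samples, so the matrix may be oriented the wrong way round (to be confirmed).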

### Gene network

In [None]:
# Build the co-expression network and detect modules.
# NOTE(review): presumably requires preprocess() to have completed
# successfully first — confirm.
scc.findModules()