<a href="https://colab.research.google.com/github/zuzanadostalova/Tutorials/blob/master/CBED_tool_intervals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Condensed code for plotting the distribution of interval lengths per chromosome.

In [0]:
# Import every package this cell needs up front so its dependencies are visible at a glance.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import savefig
from matplotlib.pyplot import show

# Read the gzipped BED file straight from the URL with pandas: compression='gzip' decompresses it,
# sep="\t" splits every line into columns on the tab delimiter, and names= labels the ten columns.
dataframe = pd.read_csv('https://www.encodeproject.org/files/ENCFF861KMV/@@download/ENCFF861KMV.bed.gz', compression='gzip', sep="\t", names=['chrom', 'start', 'end', 'name', 'score', 'strand', '7', '8', '9', '10'])

# The length of every interval is its end coordinate minus its start coordinate.
dataframe['intervals'] = dataframe.end - dataframe.start
# Preview the first rows only; the full dataframe is too long to be a useful printout.
print(dataframe.head())

# Create the figure and its axes explicitly; figsize sets the size of the graph.
fig, ax = plt.subplots(figsize=(50, 10))

# The x axis is the chromosome name, the y axis is the interval length of the rows on that chromosome.
# cut limits how far each violin extends past its extreme data points, width sets the width of the
# individual violins, and palette changes the colors of the graph.
# The plot is drawn exactly once and kept in a variable so that it can be saved afterwards;
# calling violinplot a second time would draw the same violins over the first ones.
violin = sns.violinplot(x="chrom", y="intervals", data=dataframe, cut=1, width=1.5, palette="pastel", ax=ax)

# Save the figure that holds the plot as a jpeg file.
violin.figure.savefig("chromviolin.jpg")

# show() displays the open figure; it does not take a file name.
show()

# Distribution of chromosome lengths


I.	Distribution plot of chromosome lengths


A.	Get the BED file

In [0]:
# Send commands to system (linux)
# Download from URL with wget
! wget https://www.encodeproject.org/files/ENCFF861KMV/@@download/ENCFF861KMV.bed.gz
# Gunzip to uncompress the archive
! gunzip ENCFF861KMV.bed.gz

Var. I.: Distribution Plot The Hard Way

In [0]:
# Name of the decompressed BED file that the following cells read.
input_file = "ENCFF861KMV.bed"

B.	Define the components of the dictionary

In [0]:
# Dictionary that maps each chromosome name to the list of its interval lengths.
# It starts empty and is filled with the BED file data below.
dict_dist = {}

# Open the BED file and read it line by line (iterating over the file object avoids
# loading the whole file into memory at once).
with open(input_file, 'r') as text:
  for line in text:
    line = line.strip()

    # Skip blank lines and BED header/comment lines: they hold no interval and would
    # otherwise crash the integer conversion below.
    if not line or line.startswith(("#", "track", "browser")):
      continue

    # Split the line into columns on the tab delimiter and pick the chromosome name,
    # start, and end columns; the interval length is end minus start.
    column = line.split("\t")
    chrom_name = column[0]
    start = int(column[1])
    end = int(column[2])
    interval = end - start

    # setdefault returns the list already stored for this chromosome, or stores and
    # returns a new empty list the first time the chromosome is seen. Either way the
    # interval is appended to it, so every chromosome name appears only once.
    dict_dist.setdefault(chrom_name, []).append(interval)
    # Note: dict_dist = {chrom_name: [interval, interval, ...]} - every interval length is
    # kept as its own list entry; nothing is counted or merged here.


C. Print out

In [0]:
# First print the whole dictionary: every chromosome together with its list of intervals.
print(dict_dist)

# Then go through the dictionary entry by entry and print the chromosome name
# followed, on the next line, by its list of intervals.
for chrom_name, chrom_intervals in dict_dist.items():
  print(chrom_name, chrom_intervals, sep="\n")

D. Distribution plot - histogram

In [0]:
# Seaborn - one distribution subplot per chromosome.
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Two columns and as many rows as the number of chromosomes requires, so that every
# chromosome gets its own panel (a fixed 2 x 2 grid would pile every chromosome from
# the fourth one on into the last panel). All panels share the same x axis, which
# means they have the same scale. squeeze=False keeps `axes` two-dimensional even
# when there is a single row.
n_cols = 2
n_rows = max(1, (len(dict_dist) + n_cols - 1) // n_cols)
f, axes = plt.subplots(n_rows, n_cols, figsize=(7, 3.5 * n_rows), sharex=True, squeeze=False)

# Fill the grid column by column: first down the left column, then down the right one.
panels = axes.flatten(order="F")

for ax, chromosome in zip(panels, dict_dist):
    # Create a pandas Series from the interval list of this chromosome.
    test = pd.Series(data=dict_dist[chromosome])

    # histplot with a density histogram and a KDE curve replaces the deprecated distplot.
    sns.histplot(test, kde=True, stat="density", color="skyblue", ax=ax)
    # Label the panel so it is clear which chromosome it shows.
    ax.set_title(chromosome)

# Hide the panels that are left over when the chromosomes do not fill the grid.
for ax in panels[len(dict_dist):]:
    ax.set_visible(False)


In [0]:
# NOTE(review): a relative import needs a parent package, which a notebook kernel does not
# have, so this line is expected to raise ImportError. read_bed is also not used by any
# cell visible here - confirm whether this cell is still needed.
from .bed_read import read_bed


In [0]:
# Catplot
import pandas as pd

# Build one small dataframe per chromosome (its interval lengths plus the chromosome name)
# and collect them in a list.
df_list = []
for chrom_name, value in dict_dist.items():
  df = pd.DataFrame({"intervals": value})
  df["chrom"] = chrom_name
  df_list.append(df)

# Concatenate once, after the loop: concatenating inside the loop copies all the rows
# collected so far again on every iteration.
final_df = pd.concat(df_list, axis=0)

# catplot creates its own figure and sizes it through height and aspect, so no
# plt.figure() call is needed beforehand. The plot is drawn once and kept in a variable
# so that it can be saved.
catplotI = sns.catplot(x="chrom", y="intervals", data=final_df, palette="Set1", height=5, aspect=7)

# Save through the variable that actually holds the plot.
catplotI.savefig("catplotI.jpg")

# show() displays the open figure; it does not take a file name.
show()

In [0]:
# Boxplot
import pandas as pd

# Build one small dataframe per chromosome (its interval lengths plus the chromosome name)
# and collect them in a list.
df_list = []
for chrom_name, value in dict_dist.items():
  df = pd.DataFrame({"intervals": value})
  df["chrom"] = chrom_name
  df_list.append(df)

# Concatenate once, after the loop: concatenating inside the loop copies all the rows
# collected so far again on every iteration.
final_df = pd.concat(df_list, axis=0)

# Create the figure and its axes explicitly; figsize sets the size of the graph.
fig, ax = plt.subplots(figsize=(45, 5))

# Draw the boxplot once and keep the returned axes in a variable.
boxplotI = sns.boxplot(x="chrom", y="intervals", data=final_df, palette="Set1", ax=ax)

# boxplot returns matplotlib axes, which cannot be saved directly: save the figure they
# belong to, under the boxplot's own file name so the catplot image is not overwritten.
boxplotI.figure.savefig("boxplotI.jpg")

# show() displays the open figure; it does not take a file name.
show()

Var. II: Distribution Plot the Simple Way:


The plot is created directly from a dataframe loaded from the unzipped BED file.

In [0]:
# Load BED file as a pandas dataframe
input_file = "ENCFF861KMV.bed"
from matplotlib.pyplot import show
from pylab import savefig
# Import seaborn under the alias this cell actually uses (sns).
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Load BED file as Pandas df
# Read the input file into bed_df, splitting the columns on the tab delimiter. Provide a name for
# each of the columns - for our purposes the most important are "chrom", "start", and "end".
bed_df = pd.read_csv(input_file, sep='\t', names=['chrom', 'start', 'end', 'name', 'score', 'strand', '7', '8', '9', '10'])

# Add the interval length (end minus start) as a new column of bed_df.
bed_df['intervals'] = bed_df.end - bed_df.start

# Create the figure and its axes explicitly; figsize sets the size of the graph.
fig, ax = plt.subplots(figsize=(50, 10))

# Create the violin plot with the distribution of interval lengths per chromosome.
# It is drawn once and assigned to a variable so that it can be saved.
violin = sns.violinplot(x="chrom", y="intervals", data=bed_df, cut=1, width=1.5, palette="Set1", ax=ax)
violin.figure.savefig("chromviolin.jpg")

# show() displays the open figure; it does not take a file name.
show()

In [0]:
# Box-style catplot of the interval lengths per chromosome, built from dict_dist.
input_file = "ENCFF861KMV.bed"
import pandas as pd
from matplotlib.pyplot import show
from pylab import savefig
# Import seaborn under the alias this cell actually uses (sns).
import seaborn as sns
import matplotlib.pyplot as plt

# Build one small dataframe per chromosome (its interval lengths plus the chromosome name)
# and collect them in a list.
df_list = []
for chrom_name, value in dict_dist.items():
  df = pd.DataFrame({"intervals": value})
  df["chrom"] = chrom_name
  df_list.append(df)

# Concatenate once, after the loop: concatenating inside the loop copies all the rows
# collected so far again on every iteration.
final_df = pd.concat(df_list, axis=0)

# catplot creates its own figure and sizes it through height and aspect, so no
# plt.figure() call is needed beforehand. kind='box' draws one box per chromosome.
catplot = sns.catplot(x="chrom", y="intervals", data=final_df, palette="Set1", kind='box', height=5, aspect=7)
catplot.savefig("catplot.jpg")

# show() displays the open figure; it does not take a file name.
show()

In [0]:
# Swarm plot of the interval lengths per chromosome, built from dict_dist.
input_file = "ENCFF861KMV.bed"
import pandas as pd
from matplotlib import rcParams
# Default size of the figures created from here on.
rcParams['figure.figsize'] = 20,15

# Build one small dataframe per chromosome (its interval lengths plus the chromosome name)
# and collect them in a list.
df_list = []
for chrom_name, value in dict_dist.items():
  df = pd.DataFrame({"intervals": value})
  df["chrom"] = chrom_name
  df_list.append(df)

# Concatenate once, after the loop: concatenating inside the loop copies all the rows
# collected so far again on every iteration.
final_df = pd.concat(df_list, axis=0)

# Transposed view of the chr9 rows. It is not used by the cells visible here and is kept
# only in case a later cell reads `f`.
f = final_df[final_df.chrom == "chr9"].transpose()

# Draw the swarm plot once, on the complete data. Drawing it inside the loop re-plotted
# the growing dataframe once per chromosome on top of the previous drawings.
sns.swarmplot(x="chrom", y="intervals", data=final_df, palette="Set1")