In [1]:
import pandas as pd
import sys 

In [2]:
def count_pos_kmer(Seq, k, alphabet_size=4):
    """
    Summary: Returns the number of k-mers of size k that a sequence could contain.

    Description: A sequence of length L offers L - k + 1 windows of size k, and an
    alphabet of alphabet_size symbols admits alphabet_size ** k distinct k-mers.
    The number of possible k-mers is the smaller of those two values: a sequence
    cannot hold more distinct k-mers than it has windows, nor more than exist.

    Parameters:
    Seq: the input sequence for which the number of k-mers needs to be determined
    k: the k-mer size, expected to range from 1 to the length of the sequence
    alphabet_size: number of distinct symbols a position can take; defaults to 4,
        the value previously hard-coded for k == 1 (e.g. the nucleotides A, C, G, T)

    Return:
    pos_kmers: the number of kmers possible (0 when k is larger than the sequence)
    """
    # Count the windows arithmetically instead of materialising every substring;
    # clamp at zero so an oversized k yields 0 rather than a negative count.
    window_count = max(len(Seq) - k + 1, 0)
    return min(alphabet_size ** k, window_count)

In [4]:
def create_kmer_df(Seq):
    """
    Summary: Builds a data frame with one row per k-mer size, holding the
    observed and the possible number of kmers for that size.

    Description: For every k from 1 up to the length of Seq, this function asks
    count_pos_kmer for the possible count and count_obs_kmer for the observed
    count, then assembles the results into a pandas data frame with the columns
    'k', 'Observed kmers' and 'Possible kmers'.

    Parameters:
    Seq: the input sequence for which the kmer data frame needs to be created

    Return:
    kmer_df: kmer data frame (empty when Seq is empty)
    """
    sizes = list(range(1, len(Seq) + 1))

    # Possible counts are gathered before observed counts, one call per size.
    possible_counts = [count_pos_kmer(Seq, size) for size in sizes]
    observed_counts = [count_obs_kmer(Seq, size) for size in sizes]

    columns = {
        'k': sizes,
        'Observed kmers': observed_counts,
        'Possible kmers': possible_counts,
    }
    return pd.DataFrame(columns)

In [6]:
def linguistic_complexity(Seq):
    """
    Summary: Computes the linguistic complexity of a sequence.

    Description: The kmer data frame for Seq is built with create_kmer_df, its
    'Observed kmers' and 'Possible kmers' columns are each summed over all k
    values, and the ratio of the observed total to the possible total is returned.

    Parameters:
    Seq: the input sequence for which the linguistic complexity needs to be determined

    Return:
    lc: linguistic complexity (observed total divided by possible total)
    """
    table = create_kmer_df(Seq)

    # Sum the observed column first, then the possible column.
    observed_total, possible_total = (
        sum(table[column]) for column in ('Observed kmers', 'Possible kmers')
    )
    return observed_total / possible_total

In [None]:
if __name__=='__main__':
    # Script entry point: read the file named by the first command-line
    # argument and print the linguistic complexity of every sequence in it.

    # Fail with a usage message instead of a bare IndexError when no file is given.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <sequence_file>".format(sys.argv[0]))

    myfile = sys.argv[1]
    with open(myfile,'r') as current_file:
        text = current_file.read()
    seq = text.split() # whitespace-separated tokens, each treated as one sequence
    for sequence in seq:
        lc_seq = linguistic_complexity(sequence)
        print(lc_seq) # print the linguistic complexity
        #plot_kmer_prop(sequence) # produce the plot for kmer proportion