Upload the reference list and the raw data to the Google Colab workspace

In [0]:
# List the current working directory (shell command run through IPython's `!`)
!ls
from google.colab import files
# Upload files through the Colab `files` helper; the later cells open the
# reference list and raw data CSVs by bare file name from the working directory
files.upload()

Verify the upload

In [0]:
# List the working directory again so the uploaded files can be checked by eye
!ls

Convert the reference-list CSV file to a dictionary

In [0]:
import csv

# Maps a numeric code to the name fragment used in the reference-list file name
combi_code = {0:'AspGln', 1:'AspGlu', 2:'AsnGln', 3:'AsnGlu'}
# Specify the reference to work with
code = 0
ref_list_name = 'RefList_' + combi_code[code] + '.csv'

# ref maps column 1 of every data row (kept as a string key) to column 2
# parsed as a float; a repeated key keeps the value from its last row
ref = {}

with open(ref_list_name, newline='') as csvfile:
  mycsv = csv.reader(csvfile)
  # Skip the header; the None default avoids StopIteration on an empty file
  next(mycsv, None)
  for row in mycsv:
    # csv.reader yields [] for blank lines, and spreadsheet exports may end
    # with rows of empty fields; neither carries data, so skip them instead
    # of failing on row[1] / float('')
    if not any(field.strip() for field in row):
      continue
    ref[row[1]] = float(row[2])

print(len(ref))

Convert the raw-data CSV file to a list

In [0]:
raw_data_name = 'anc.csv'

# raw holds column 0 of every data row parsed as a float, in file order;
# positions in this list are what the analysis later records as indices
raw = []

with open(raw_data_name, newline='') as csvfile:
  mycsv = csv.reader(csvfile)
  # Skip the header; the None default avoids StopIteration on an empty file
  next(mycsv, None)
  for row in mycsv:
    # csv.reader yields [] for blank lines, and spreadsheet exports may end
    # with rows of empty fields; skip them instead of failing on row[0]
    if not any(field.strip() for field in row):
      continue
    raw.append(float(row[0]))

print(len(raw))

Implement the analysis

In [0]:
# Relative tolerance for a match, expressed in parts per million; adjust as needed
thres_ppm = 50
thres = thres_ppm/1000000
# analysis maps a reference key to [hit count, raw index, raw index, ...]
analysis = {}

# Walk the raw values together with their positions in the raw list
for idx, observed in enumerate(raw):
  # Report progress once every 1000 raw values
  if idx%1000==0:
    print(idx)
  # Test the current raw value against every reference entry
  for name, expected in ref.items():
    # A hit is a relative deviation from the reference value within the tolerance
    if abs((observed-expected)/expected)<=thres:
      hits = analysis.get(name)
      if hits is None:
        # First hit for this key: start the counter at 1 and store the raw index
        analysis[name] = [1, idx]
      else:
        # Key already recorded: bump the counter and store the raw index
        hits[0] += 1
        hits.append(idx)

print(analysis)
print('Length: ', len(analysis))

# Total hit count is the sum of the per-key counters (element 0 of each entry)
total_hit = sum(hits[0] for hits in analysis.values())
print('Total hit:', total_hit)

Export the result as a CSV file

In [0]:
# Text before the first '.' of the raw data file name
root_name = raw_data_name.split('.')[0]

# Output the hit analysis
output_file_name = root_name + '_' + combi_code[code] + '_analysis.csv'
# newline='' hands line-ending control to the csv writer, as the csv docs advise
with open(output_file_name, 'w', newline='') as f:
  # Header text is written verbatim, unchanged from the previous output format
  f.write("Index, Property, Hit count, Original index\n")
  # csv.writer quotes keys that contain commas, quotes or newlines, so such a
  # key can no longer shift the columns; lineterminator='\n' matches the header
  writer = csv.writer(f, lineterminator='\n')
  for row_number, (key, hits) in enumerate(analysis.items(), start=1):
    # One row per key: running index, characteristic string, hit count
    # (hits[0]) and then every recorded raw-data index (hits[1:])
    writer.writerow([row_number, key, *hits])



Download the output file

In [0]:
# List the working directory so the generated analysis file can be checked by eye
!ls
# Download the analysis CSV through the Colab `files` helper imported in the upload cell
files.download(output_file_name)