In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
import pickle

In [2]:
raw_df = pd.read_csv('./venom.tsv', sep='\t')

In [4]:
raw_df['Protein families'].head()

0                             Conotoxin I2 superfamily
1                                                  NaN
2    Venom metalloproteinase (M12B) family, P-II su...
3    Venom metalloproteinase (M12B) family, P-II su...
4    Venom metalloproteinase (M12B) family, P-II su...
Name: Protein families, dtype: object

In [6]:
len(raw_df['Protein families'].unique())

608

In [25]:
families_raw = list(raw_df['Protein families'].unique())
families_raw

['Conotoxin I2 superfamily',
 nan,
 'Venom metalloproteinase (M12B) family, P-II subfamily, P-IIa sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-II subfamily, P-IIe sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-II subfamily',
 'Venom metalloproteinase (M12B) family, P-III subfamily',
 'Venom metalloproteinase (M12B) family, P-III subfamily, P-IIIb sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-II subfamily, P-IId sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-III subfamily, P-IIIa sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-III subfamily, P-IIIc sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-II subfamily, P-IIb sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-II subfamily, P-IIc sub-subfamily',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family, P-III subfamily, P-IIId sub-subfamily',
 'Venom metalloproteinase (M12B) family, P-I subfamily',
 'Limacoditoxin-1 (ACP-like) f

In [33]:
families_raw[1] = 'No'

In [52]:
families_list = []
for record in families_raw:
        family = record.split(' family')[0] + ' family'
        families_list.append(family)

families_list

['Conotoxin I2 superfamily family',
 'No family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Venom metalloproteinase (M12B) family',
 'Limacoditoxin-1 (ACP-like) family',
 'Xibalbin-2 family',
 'Xibalbin-13 family',
 'Xibalbin-1 family',
 'Three-finger toxin family',
 'Three-finger toxin family',
 'Three-finger toxin family',
 'Three-finger toxin family',
 'Three-finger toxin family',
 'Bradykinin-related peptide family',
 'Short scorpion toxin superfamily, Potassium channel inhibitor kappa-KTx family',
 'Venom waprin family',
 'Ve

In [53]:
families_set = set(families_list)
len(families_set)

308

In [54]:
families_set

{"5'-nucleotidase family",
 'AB hydrolase superfamily, Lipase family',
 'ATP:guanido phosphotransferase family',
 'AVIT (prokineticin) family',
 'Acrorhagin I family',
 'Actinoporin family',
 'Ant myrmeciitoxin-01 family',
 'Ant venom allergen 2/4 family',
 'Arthropod CHH/MIH/GIH/VIH hormone family',
 'Arthropod phospholipase D family',
 'Asilidin-1 family',
 'Asilidin-12 family',
 'Beta-defensin family',
 'Bradykinin inhibitor peptide family',
 'Bradykinin-potentiating peptide family',
 'Bradykinin-related peptide family',
 'CART family',
 'CREC family',
 'CRISP family',
 'Calmodulin family',
 'Caterpillar 1 family',
 'Caterpillar 11 family',
 'Caterpillar 2 family',
 'Caterpillar 3 family',
 'Caterpillar 4 family',
 'Caterpillar 8 family',
 'Caterpillar 9 family',
 'Cathelicidin family',
 'Cationic peptide 01 (latrotoxin) family',
 'Cationic peptide 02 (oxyopinin-2) family',
 'Cationic peptide 03 (latarcin) family',
 'Cationic peptide 04 (cupiennin) family',
 'Cationic peptide 06 (cy

In [55]:
test_df = raw_df.sample(20).copy()
test_df

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Organism,Length,Protein families,Taxonomic lineage
3531,D9U2B1,reviewed,KBX22_LYCMC,Neurotoxin beta-KTx 12,Lychas mucronatus (Chinese swimming scorpion),89,"Long chain scorpion toxin family, Class 2 subf...","cellular organisms (no rank), Eukaryota (domai..."
6943,O57385,reviewed,PA2H_DEIAC,Basic phospholipase A2 homolog acutohaemolysin...,Deinagkistrodon acutus (Hundred-pace snake) (A...,138,"Phospholipase A2 family, Group II subfamily, K...","cellular organisms (no rank), Eukaryota (domai..."
3811,P0DUK7,reviewed,3SOF1_NAJAT,Mu-elapitoxin-Na1a (Mu-EPTX-Na1a),Naja atra (Chinese cobra),62,"Three-finger toxin family, Short-chain subfami...","cellular organisms (no rank), Eukaryota (domai..."
4714,P58305,reviewed,TXGRX_BUNCN,Granulitoxin (GRX),Bunodosoma cangicum (Sea anemone),21,,"cellular organisms (no rank), Eukaryota (domai..."
1089,P0CC19,reviewed,TXL1_LASSB,U1-theraphotoxin-Lsp1a (U1-TRTX-Lsp1a) (LTx1),Lasiodora sp. (strain IBSP 8539) (Brazilian sa...,99,"Neurotoxin 12 (Hwtx-2) family, 04 (lasiotoxin)...","cellular organisms (no rank), Eukaryota (domai..."
4777,P0DX32,reviewed,MAST2_EUMMI,Eumenine mastoparan-EM2 (EMP-EM2),Eumenes micado (Potter wasp),14,"MCD family, Mastoparan subfamily","cellular organisms (no rank), Eukaryota (domai..."
7321,P58328,reviewed,SCX4_OLIMR,Alpha-like toxin BmK-M4 (BmK4) (BmKM4) (Bmk M4...,Olivierus martensii (Manchurian scorpion) (Mes...,64,"Long (4 C-C) scorpion toxin superfamily, Sodiu...","cellular organisms (no rank), Eukaryota (domai..."
2904,P59886,reviewed,KAX38_HOTTS,Potassium channel toxin alpha-KTx 3.8 (Charybd...,Hottentotta tamulus sindicus (Scorpion) (Buthu...,38,"Short scorpion toxin superfamily, Potassium ch...","cellular organisms (no rank), Eukaryota (domai..."
1912,C0HLK3,reviewed,3S11_MICTC,Three-finger toxin Tschuditoxin-I,Micrurus tschudii (Desert coral snake),57,"Three-finger toxin family, Short-chain subfami...","cellular organisms (no rank), Eukaryota (domai..."
4177,B3EWI9,reviewed,OXLA_BOTMT,L-amino-acid oxidase (Bm-LAO) (LAAO) (EC 1.4.3.2),Bothrops mattogrossensis (Pitviper) (Bothrops ...,31,"Flavin monoamine oxidase family, FIG1 subfamily","cellular organisms (no rank), Eukaryota (domai..."


In [61]:
def simplify_family(df, fam_set):
    for family in fam_set:
        for index, row in df.iterrows():
            if family in str(row['Protein families']):
                df.at[index, 'Protein families'] = family

In [62]:
simplified_df = raw_df.copy()
simplify_family(simplified_df, families_set)

In [63]:
simplified_df.head()

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Organism,Length,Protein families,Taxonomic lineage
0,P69499,reviewed,I2_CONST,conotoxin S11.3,Conus striatus (Striated cone),68,Conotoxin I2 superfamily,"cellular organisms (no rank), Eukaryota (domai..."
1,P0DUS0,reviewed,PRCT_TITSE,[des-Arg1]-proctolin (NEP-inhibiting peptide),Tityus serrulatus (Brazilian scorpion),4,,"cellular organisms (no rank), Eukaryota (domai..."
2,C0HLL1,reviewed,VM2CD_CRODU,Zinc metalloproteinase/disintegrin-like CdtV1 ...,Crotalus durissus terrificus (South American r...,32,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."
3,C0L2T8,reviewed,VM2C1_CRODO,Zinc metalloproteinase/disintegrin (Metallopro...,Crotalus durissus collilineatus (Brazilian rat...,478,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."
4,C9E1R9,reviewed,VM2V2_CROVV,Zinc metalloproteinase/disintegrin VMP-II [Cle...,Crotalus viridis viridis (Prairie rattlesnake),478,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."


In [66]:
pickle.dump(simplified_df, open('./pickles/simplified_data.pkl', 'wb'))

In [67]:
simplified_df = pickle.load(open('./pickles/simplified_data.pkl', 'rb'))
simplified_df.head()

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Organism,Length,Protein families,Taxonomic lineage
0,P69499,reviewed,I2_CONST,conotoxin S11.3,Conus striatus (Striated cone),68,Conotoxin I2 superfamily,"cellular organisms (no rank), Eukaryota (domai..."
1,P0DUS0,reviewed,PRCT_TITSE,[des-Arg1]-proctolin (NEP-inhibiting peptide),Tityus serrulatus (Brazilian scorpion),4,,"cellular organisms (no rank), Eukaryota (domai..."
2,C0HLL1,reviewed,VM2CD_CRODU,Zinc metalloproteinase/disintegrin-like CdtV1 ...,Crotalus durissus terrificus (South American r...,32,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."
3,C0L2T8,reviewed,VM2C1_CRODO,Zinc metalloproteinase/disintegrin (Metallopro...,Crotalus durissus collilineatus (Brazilian rat...,478,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."
4,C9E1R9,reviewed,VM2V2_CROVV,Zinc metalloproteinase/disintegrin VMP-II [Cle...,Crotalus viridis viridis (Prairie rattlesnake),478,Venom metalloproteinase (M12B) family,"cellular organisms (no rank), Eukaryota (domai..."


In [74]:
simplified_df['Protein families'].isnull().sum()

np.int64(539)