In [1]:
import pandas as pd
import seaborn as sns
import time
import re
import nltk
import math
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Load the raw table. header=None makes pandas treat the first line of the
# file as data, so the columns are labelled 0, 1, 2 until they are renamed.
df = pd.read_csv(
    'xtal_2.csv',
    header=None,
)
df.head(10)

Unnamed: 0,0,1,2
0,ATCG3D_18,DYKDDDDAMGQPGNGSAFLLAPNRSHAPDHDVTQQRDEVWVVGMGI...,1
1,BSGCAIR30348,MDKKYDITAVLNEDSSMTAISDQFQITLDARPKHTAKGFGPLAALL...,1
2,BSGCAIR30512,MTESFTRRERLRLRRDFLLIFKEGKSLQNEYFVVLFRKNGLDYSRL...,1
3,BSGCAIR30561,MEHDERTHVPVELRAAGVVLLNERGDILLVQEKGIPGHPEKAGLWH...,1
4,BSGCAIR30591,MFYKEENFKKTEIGEIPEDWEIVELKDVCKKIKAGGTPKTSVEEYY...,1
5,BSGCAIR30656,MAIRLYKLAVALGVFIVSAPAFSHGHHSHGKPLTEVEQKAANGVFD...,1
6,BSGCAIR31213,MKDIDTLISNNALWSKMLVEEDPGFFEKLAQAQKPRFLWIGCSDSR...,1
7,GO.102486,GSHMQRQRPPSRAGGDMDRLQSALALYEEAMGYTYAAALRAAAAVG...,1
8,GO.102706,LDQILRATVEEVRAFLGTDRVKVYRFDPEGHGTVVAEARGGERLPS...,1
9,GO.110986,GSPDPEIFRQRFRQFGYQDSPGPREAVSQLRELCRLWLRPETHTKE...,1


In [3]:
# Replace the default integer column labels with descriptive names.
df.columns = [
    'id',
    'sequence',
    'result',
]
df.head()

Unnamed: 0,id,sequence,result
0,ATCG3D_18,DYKDDDDAMGQPGNGSAFLLAPNRSHAPDHDVTQQRDEVWVVGMGI...,1
1,BSGCAIR30348,MDKKYDITAVLNEDSSMTAISDQFQITLDARPKHTAKGFGPLAALL...,1
2,BSGCAIR30512,MTESFTRRERLRLRRDFLLIFKEGKSLQNEYFVVLFRKNGLDYSRL...,1
3,BSGCAIR30561,MEHDERTHVPVELRAAGVVLLNERGDILLVQEKGIPGHPEKAGLWH...,1
4,BSGCAIR30591,MFYKEENFKKTEIGEIPEDWEIVELKDVCKKIKAGGTPKTSVEEYY...,1


In [4]:
df.shape

(4791, 3)

In [5]:
df.dtypes

id          object
sequence    object
result       int64
dtype: object

In [6]:
# Count the 0's (failures) and 1's (successes) in the result column.
# Grouping by 'result' before value_counts() is what produces the doubled
# (result, result) index in the output; df['result'].value_counts() gives the
# same two counts with a flat index.
df.groupby('result')['result'].value_counts()

result  result
0       0         3913
1       1          878
Name: result, dtype: int64

## Create a small, balanced dataframe with equal numbers of proteins that crystallized and proteins that did not crystallize.

In [7]:
# Subset of rows whose label is 1, i.e. the proteins recorded as crystallized.
crystals = df.loc[df['result'] == 1]
crystals.head()

Unnamed: 0,id,sequence,result
0,ATCG3D_18,DYKDDDDAMGQPGNGSAFLLAPNRSHAPDHDVTQQRDEVWVVGMGI...,1
1,BSGCAIR30348,MDKKYDITAVLNEDSSMTAISDQFQITLDARPKHTAKGFGPLAALL...,1
2,BSGCAIR30512,MTESFTRRERLRLRRDFLLIFKEGKSLQNEYFVVLFRKNGLDYSRL...,1
3,BSGCAIR30561,MEHDERTHVPVELRAAGVVLLNERGDILLVQEKGIPGHPEKAGLWH...,1
4,BSGCAIR30591,MFYKEENFKKTEIGEIPEDWEIVELKDVCKKIKAGGTPKTSVEEYY...,1


In [8]:
crystals.shape

(878, 3)

In [9]:
# Subset of rows whose label is 0, i.e. the proteins that failed to crystallize.
no_xtal = df.loc[df['result'] == 0]
no_xtal.head()

Unnamed: 0,id,sequence,result
426,APC7908,MNQHLLGNPKLTVTHVNEVKAGINHIVVDSVQYGNQEMIMEKDGTV...,0
439,APC7603,MSETATWQPSASIPNLLKRAAIMAEIRRFFADRGVLEVETPCMSQA...,0
519,hsk002000585.2,RRAGSVKRGEARLFGPTERQSERPLRPSAARRPEMLSGKKAAAAAA...,0
881,GO.35285,MARRKRRNFSKQASEILNEYFYSHLSNPYPSEEAKEELARKCGITV...,0
882,GO.36643,SEKLAASTEPQGPRPVLGRESVQVPDDQDFRSFRSECEAEVGWNLT...,0


In [10]:
no_xtal.shape

(3913, 3)

In [11]:
# Draw 400 rows from each class so the working set is balanced.
# random_state pins the draw so that a fresh kernel run selects the same rows
# (without it every run trains and tests on a different subset).
df_xtal = crystals.sample(n=400, random_state=42)
df_no_xtal = no_xtal.sample(n=400, random_state=42)

In [12]:
df_xtal.shape

(400, 3)

In [13]:
df_no_xtal.shape

(400, 3)

In [14]:
# Stack the two class samples into one frame, then shuffle the rows so the
# classes are interleaved. random_state makes the permutation repeatable for
# a given input; the original index labels are kept.
df_rand = pd.concat([df_xtal, df_no_xtal]).sample(frac=1, random_state=42)
df_rand.head(3)

Unnamed: 0,id,sequence,result
2592,MytuD.00386.a,MSDLVRVERKGRVTTVILNRPASRNAVNGPTAAALCAAFEQFDRDD...,0
220,APC1773,MKLIAIDLDGTLLNSKHQVSLENENALRQAQRDGIEVVVSTGRAHF...,1
702,ttk003001467.1,MARFALVLHAHLPYVRAHGMWPFGEETLYEAMAETYLPLIRVLERL...,1


In [15]:
df_rand.shape

(800, 3)

In [16]:
df_rand.dtypes

id          object
sequence    object
result       int64
dtype: object

## Transform the data first with CountVectorizer, then split the data

In [17]:
# Tripeptide counter: token_pattern makes every single word character its own
# token, and ngram_range=(3, 3) keeps only runs of three consecutive tokens.
vect_3 = CountVectorizer(
    min_df=1,
    token_pattern=r'\w{1}',
    ngram_range=(3, 3),
)

In [18]:
# Learn the tripeptide vocabulary from all of df_rand and build the count
# matrix in one step; the labels come from the same frame, so their order
# matches the rows of X.
# NOTE(review): the vectorizer sees every row here, including the rows that
# the later train/test split assigns to the test set.
X = vect_3.fit_transform(df_rand['sequence'])
y = df_rand['result']

## Split the data into training & test sets

In [19]:
# X is already vectorized at this point, so what runs here is
# transform-then-split; the split-first variant is not executed by this cell.
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Class balance of the training labels (0 = failed to crystallize, 1 = crystallized)
y_train.value_counts()

0    324
1    316
Name: result, dtype: int64

In [21]:
X_train.shape

(640, 7682)

In [22]:
# Class balance of the test labels (0 = failed to crystallize, 1 = crystallized)
y_test.value_counts()

1    84
0    76
Name: result, dtype: int64

In [23]:
X_train

<640x7682 sparse matrix of type '<class 'numpy.int64'>'
	with 147256 stored elements in Compressed Sparse Row format>

In [24]:
X_test

<160x7682 sparse matrix of type '<class 'numpy.int64'>'
	with 37224 stored elements in Compressed Sparse Row format>

In [25]:
y_train

2854    0
821     1
2380    0
3664    0
314     1
295     1
797     1
8       1
311     1
231     1
4202    0
454     1
200     1
760     1
397     1
442     1
1051    0
410     1
274     1
3945    0
3983    0
2003    0
11      1
254     1
796     1
879     1
184     1
3631    0
3684    0
524     1
       ..
1368    0
2764    0
4098    0
4720    0
1302    0
2310    0
3884    0
4711    0
2710    0
148     1
2387    0
3267    0
689     1
810     1
697     1
849     1
858     1
2672    0
830     1
228     1
4474    0
329     1
3752    0
650     1
854     1
600     1
2840    0
373     1
1394    0
75      1
Name: result, Length: 640, dtype: int64

In [26]:
y_test

166     1
2737    0
1634    0
3864    0
3092    0
766     1
1758    0
115     1
4120    0
4701    0
644     1
37      1
535     1
2728    0
71      1
2903    0
4583    0
380     1
2292    0
466     1
357     1
866     1
1089    0
4236    0
632     1
1748    0
889     0
4605    0
493     1
435     1
       ..
226     1
133     1
476     1
509     1
120     1
744     1
1212    0
730     1
3594    0
3905    0
768     1
364     1
49      1
570     1
800     1
4258    0
3790    0
85      1
97      1
585     1
4494    0
702     1
864     1
805     1
2800    0
110     1
84      1
198     1
856     1
802     1
Name: result, Length: 160, dtype: int64

In [27]:
# Logistic regression on tripeptide counts (CountVectorizer features).
# score() predicts on X_test internally and returns the mean accuracy, so no
# separate predict() call is needed in this cell.
# NOTE(review): the same protein ID shows up under several row indices in this
# table, so identical sequences may sit in both the training and the test set
# and inflate this score -- check df.sequence.duplicated().sum() to confirm.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Score: {:.2f}".format(lr.score(X_test, y_test)))

Score: 0.99


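## It reportedly makes no difference whether the entire text data is transformed by the vectorizer first and then split into training & test sets, or split first and then transformed: the accuracy is still high. Caveat: only the transform-then-split order is actually run in this notebook, and the same protein ID appears on several different rows of the table, so identical sequences shared between the training and test sets may be what drives the high score.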

## Try logistic regression on the entire, unbalanced data set: far more proteins failed to crystallize (3913) than crystallized (878)

In [28]:
# Shuffle all rows of the full table before vectorizing and splitting.
# random_state fixes the permutation so that a fresh run reproduces it; the
# original index labels are kept on the shuffled rows.
data = df.sample(frac=1, random_state=42)
data.shape

(4791, 3)

In [29]:
df.head(3)

Unnamed: 0,id,sequence,result
0,ATCG3D_18,DYKDDDDAMGQPGNGSAFLLAPNRSHAPDHDVTQQRDEVWVVGMGI...,1
1,BSGCAIR30348,MDKKYDITAVLNEDSSMTAISDQFQITLDARPKHTAKGFGPLAALL...,1
2,BSGCAIR30512,MTESFTRRERLRLRRDFLLIFKEGKSLQNEYFVVLFRKNGLDYSRL...,1


In [30]:
data.head(3)

Unnamed: 0,id,sequence,result
1425,MytuD.00386.a,MSDLVRVERKGRVTTVILNRPASRNAVNGPTAAALCAAFEQFDRDD...,0
2308,MytuD.00386.a,MSDLVRVERKGRVTTVILNRPASRNAVNGPTAAALCAAFEQFDRDD...,0
3590,MytuD.00386.a,MSDLVRVERKGRVTTVILNRPASRNAVNGPTAAALCAAFEQFDRDD...,0


In [31]:
# NOTE: [0] on a Series with an integer index is a label lookup, so this
# returns the sequence of the row labelled 0 (the first row of the unshuffled
# df), not the first row of the shuffled frame. Use data.sequence.iloc[0] to
# get the first shuffled row instead.
data.sequence[0]

'DYKDDDDAMGQPGNGSAFLLAPNRSHAPDHDVTQQRDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIWTLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKHHHHHH'

In [32]:
# Class counts of the shuffled frame; they equal the counts on df because
# sample(frac=1) only reorders the rows.
data.groupby('result')['result'].value_counts()

result  result
0       0         3913
1       1          878
Name: result, dtype: int64

In [33]:
# Same tripeptide setup as for the balanced subset (single-character tokens,
# 3-grams only), this time fitted on every sequence in the shuffled full table.
vector_3 = CountVectorizer(
    min_df=1,
    token_pattern=r'\w{1}',
    ngram_range=(3, 3),
)
X = vector_3.fit_transform(data['sequence'])
y = data['result']

In [34]:
X.shape

(4791, 7934)

In [35]:
# 80/20 train/test split of the vectorized full data set, seeded for
# repeatability.
# NOTE(review): the split is not stratified although the classes are
# imbalanced; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
X_train.shape

(3832, 7934)

In [37]:
X_test.shape

(959, 7934)

In [38]:
# Class balance of the training labels (0 = failed to crystallize, 1 = crystallized)
y_train.value_counts()

0    3122
1     710
Name: result, dtype: int64

In [39]:
# Class balance of the test labels (0 = failed to crystallize, 1 = crystallized)
y_test.value_counts()

0    791
1    168
Name: result, dtype: int64

In [40]:
X_train

<3832x7934 sparse matrix of type '<class 'numpy.int64'>'
	with 928971 stored elements in Compressed Sparse Row format>

In [41]:
X_test

<959x7934 sparse matrix of type '<class 'numpy.int64'>'
	with 231049 stored elements in Compressed Sparse Row format>

In [42]:
# Fit logistic regression on the tripeptide counts of the full, imbalanced
# data and keep the test-set predictions for inspection.
lr = LogisticRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)
# Accuracy alone is hard to interpret here: in the recorded run, predicting 0
# for every test row would already score about 0.82 (791 of 959).
print("Score: {:.2f}".format(lr.score(X_test, y_test)))

Score: 0.99


In [43]:
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1,