In [3]:
'''

1. The columns chosen are ["characteristics_ch1","molecule_ch1","organism_ch1",'type',"title"] after eliminating the less useful feautures.

2. We use semi supervised learning here to increase the training data. Initially the labelled data available wasn't much so we trained
a model on that and then calculated labels for the unlabelled data.

3. So Now data size has been increased to 15,000 samples. We then trained the classification model on this extended data.

4. The choice of model did not make much change to the accuracy as the main issue was the lack of data.



'''

'\n\n1. The columns chosen are ["characteristics_ch1","molecule_ch1","organism_ch1",\'type\',"title"] after eliminating the less useful feautures.\n\n2. We use semi supervised learning here to increase the training data. Initially the labelled data available wasn\'t much so we trained\na model on that and then calculated labels for the unlabelled data.\n\n3. So Now data size has been increased to 15,000 samples. We then trained the classification model on this extended data.\n\n4. The choice of model did not make much change to the accuracy as the main issue was the lack of data.\n\n\n\n'

## Importing required libraries

In [12]:
# Used in all sections for managing data and files
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import re

# NTLK is used for preprocessing text. You can find out more about each module using their documentation.
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords
from wordcloud import WordCloud, STOPWORDS

# Scikit-Learn is used for feature extraction and training a logistic regression model
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
labelled_training_data_path = '/content/labelled_train_data.csv'
train_df = pd.read_csv(labelled_training_data_path)

## Exploring the data

In [6]:
train_df

Unnamed: 0.1,Unnamed: 0,geo_accession,gse_id,ctrl,pert,channel_count,characteristics_ch1,contact_address,contact_city,contact_country,...,extract_protocol_ch2,label_ch2,label_protocol_ch2,molecule_ch2,organism_ch2,source_name_ch2,taxid_ch2,treatment_protocol_ch2,biomaterial_provider_ch2,growth_protocol_ch2
0,0,GSM1617977,GSE66250,0.0,1.0,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
1,1,GSM1617983,GSE66250,0.0,1.0,1,facs sorting: Unsorted,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
2,2,GSM1617982,GSE66250,1.0,0.0,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
3,3,GSM1617975,GSE66250,0.0,1.0,1,facs sorting: CD44high/CD24low,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
4,0,GSM1267968,GSE52505,0.0,1.0,1,tissue: human nasal polyp,"148, Gurodong-ro, Guro-gu",Seoul,South Korea,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0,GSM1462977,GSE59980,0.0,1.0,1,cell line: MCF7,1450 Biggy Street NRT6514,Los Angeles,USA,...,,,,,,,,,,
619,1,GSM1462972,GSE59980,1.0,0.0,1,cell line: MCF7,1450 Biggy Street NRT6514,Los Angeles,USA,...,,,,,,,,,,
620,2,GSM1462974,GSE59980,1.0,0.0,1,cell line: MCF7,1450 Biggy Street NRT6514,Los Angeles,USA,...,,,,,,,,,,
621,3,GSM1462976,GSE59980,0.0,1.0,1,cell line: MCF7,1450 Biggy Street NRT6514,Los Angeles,USA,...,,,,,,,,,,


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 53 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                623 non-null    int64  
 1   geo_accession             623 non-null    object 
 2   gse_id                    623 non-null    object 
 3   ctrl                      623 non-null    float64
 4   pert                      623 non-null    float64
 5   channel_count             623 non-null    int64  
 6   characteristics_ch1       623 non-null    object 
 7   contact_address           623 non-null    object 
 8   contact_city              623 non-null    object 
 9   contact_country           623 non-null    object 
 10  contact_department        503 non-null    object 
 11  contact_email             394 non-null    object 
 12  contact_institute         623 non-null    object 
 13  contact_name              623 non-null    object 
 14  contact_st

## Feature seletction
#### Finding the appropriate,useful features to achieve the best possible performance of ML algorithm

In [8]:
'''
After alot of experimentation, analysis, study following 5 features gave me the best accuracy-

["characteristics_ch1","molecule_ch1","organism_ch1",'type',"title"]
'''


cols=["characteristics_ch1","molecule_ch1","organism_ch1",'type',"title"] 

train_df.loc[:, 'feature'] = ""
# We are concatenating all the columns with a space.
train_df['feature'] = train_df.apply(lambda row: ' '.join([str(row[i]) for i in cols]), axis=1)
stop_words = set(stopwords.words('english'))


In [9]:
train_df["channel_count"].value_counts()

1    597
2     26
Name: channel_count, dtype: int64

In [10]:
train_df["feature"]

0      facs sorting: CD44low/CD24high polyA RNA Homo ...
1      facs sorting: Unsorted polyA RNA Homo sapiens ...
2      facs sorting: CD44low/CD24high polyA RNA Homo ...
3      facs sorting: CD44high/CD24low polyA RNA Homo ...
4      tissue: human nasal polyp total RNA Homo sapie...
                             ...                        
618    cell line: MCF7 total RNA Homo sapiens SRA RNA...
619    cell line: MCF7 total RNA Homo sapiens SRA RNA...
620    cell line: MCF7 total RNA Homo sapiens SRA RNA...
621    cell line: MCF7 total RNA Homo sapiens SRA RNA...
622    cell line: MCF7 total RNA Homo sapiens SRA RNA...
Name: feature, Length: 623, dtype: object

In [13]:
def preprocess(data_df):
    data_df['cleaned_feature'] = ''
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()
    for index, row in tqdm(data_df.iterrows(), total=data_df.shape[0]):
        sample = row['feature']
        
        pre_txt = re.sub(r"[^a-zA-Z0-9- ]", " ", sample)
        pre_txt = pre_txt.lower()
        sample_words = [wordnet_lemm.lemmatize(w) for w in pre_txt.split() if w not in stop_words and len(w)>1]
        pre_proc_ver = ' '.join(sample_words)
        data_df.loc[index, 'cleaned_feature'] = pre_proc_ver
    return data_df
        
# Cleaned Training set
cleaned_train_df = preprocess(train_df.copy())


100%|██████████| 623/623 [00:01<00:00, 328.13it/s]


In [14]:
cleaned_train_df["cleaned_feature"][0]

'facs sorting cd44low cd24high polya rna homo sapiens sra rep1 cd44low dox'

## Feature extraction and encoding 

In [None]:
'''
Bag of words (countvectorizer) approach gave me the best score. May be because our input features had 
large vocabulary of different/unique words that was resulting into correct predictions as TFIDF works on the
principle of frequency of words, because of the uniqueness of words the frequency was going to be 1 for most of the cases.
'''




'\nI had extensive experimentation between TFIDF and bag-of-words.  \nBag of words (countvectorizer) approach gave me the best score. May be because our input features had \nlarge vocabulary of different/unique words that was resulting into correct predictions as TFIDF works on the\nprinciple of frequency of words, because of the uniqueness of words the frequency was going to be 1 for most of the cases.\n'

#### a) Extracting features from unlabelled data

In [15]:
vect = CountVectorizer(analyzer="word", preprocessor=None, stop_words=stop_words, max_features=15000)

In [23]:

unlabelled_df=pd.read_csv("/content/unlabelled_train_data.csv").sample(15000,random_state=42)

unlabelled_df['feature'] = unlabelled_df.apply(lambda row: ' '.join([str(row[i]) for i in cols]), axis=1)
cleaned_unlabelled_df = preprocess(unlabelled_df.copy())

vect=vect.fit(cleaned_unlabelled_df['cleaned_feature'])
X_unlabelled=vect.transform(cleaned_unlabelled_df['cleaned_feature'])
X_unlabelled=X_unlabelled.toarray()

100%|██████████| 15000/15000 [00:13<00:00, 1124.88it/s]


#### b) Extracting features from labelled data

In [24]:

X_train_vect = vect.transform(cleaned_train_df['cleaned_feature'])
y_train = cleaned_train_df['ctrl'].astype(int).tolist()

X = vect.transform(cleaned_train_df['cleaned_feature']).toarray()
Y = cleaned_train_df['ctrl'].astype(int).tolist()


### Spliting into training and testing data (labelled)

In [25]:
from sklearn.model_selection import train_test_split

X_train_vect,X_valid_vect,y_train,y_valid =train_test_split(X_train_vect,y_train,test_size=0.15,\
                                                            stratify=y_train,random_state=42)
X_train_vect=X_train_vect.toarray()
X_valid_vect=X_valid_vect.toarray()
# X_train_vect

In [26]:
X_train_vect.shape

(529, 8507)

# Weekly supervised learning (Data Curation)

In [27]:
X_unlabelled.shape

(15000, 8507)

In [29]:

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import f1_score,roc_auc_score


In [30]:
model=LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)
print("Classification report for ctrl (1 - ctrl, 0 - pert)\n")
y_predictions = model.predict(X_valid_vect)
print(classification_report(y_valid, y_predictions))
    

Classification report for ctrl (1 - ctrl, 0 - pert)

              precision    recall  f1-score   support

           0       0.95      0.88      0.91        48
           1       0.88      0.96      0.92        46

    accuracy                           0.91        94
   macro avg       0.92      0.92      0.91        94
weighted avg       0.92      0.91      0.91        94



In [31]:

len(y_predictions)

94

In [32]:
pd.Series(y_predictions).value_counts()

1    50
0    44
dtype: int64

### Concatenating the labelled and unlabelled data after generating the labels 
#### So now we have around 15,000 datapoints to train the final classification model

In [33]:
X=np.concatenate([X,X_unlabelled],axis=0)
Y=np.concatenate([Y,y_predictions],axis=0)

print(f"X: {X.shape}, Y: {Y.shape}")

X: (15623, 8507), Y: (717,)


In [34]:
X_train_vect,X_valid_vect,y_train,y_valid =train_test_split(X,Y,test_size=0.20,\
                                                            stratify=Y,random_state=42)

ValueError: ignored

In [None]:
X_train_vect.shape

(12498, 9566)

## Training the final classification model
#### Experimentating with different types of models

In [None]:
model=LogisticRegression(max_iter=1000)

model.fit(X_train_vect, y_train)
print("Classification report for ctrl (1 - ctrl, 0 - pert)\n")
y_predictions = model.predict(X_valid_vect)
print(classification_report(y_valid, y_predictions))



Classification report for ctrl (1 - ctrl, 0 - pert)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2307
           1       0.99      0.94      0.97       818

    accuracy                           0.98      3125
   macro avg       0.98      0.97      0.98      3125
weighted avg       0.98      0.98      0.98      3125



In [None]:

test_df = pd.read_csv('../input/mlrw-biomedicalhackathon/data_only_test.csv')

test_df.loc[:, 'feature'] = ""
test_df['feature'] = test_df.apply(lambda row: ' '.join([str(row[i]) for i in cols]), axis=1)

cleaned_test_df = preprocess(test_df.copy())
X_test_vect = final_vect.transform(cleaned_test_df['cleaned_feature'].tolist())
X_test_vect=X_test_vect.toarray()

100%|██████████| 6070/6070 [00:03<00:00, 1844.66it/s]


In [None]:

test_predictions = model.predict(X_test_vect)
cleaned_test_df['ctrl'] = test_predictions

cleaned_test_df = cleaned_test_df[['geo_accession', 'ctrl']]

# make sure its a float!
cleaned_test_df['ctrl'] = cleaned_test_df['ctrl'].astype(np.float64)

cleaned_test_df.to_csv('submission.csv', index=False)