In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stop_words = ['हूँ', 'हो','हूं', 'मैं','में','तू', 'है', 'हैं','अथव', 'अद', 'अध', 'अन', 'अपन', 'अभ', 'अल', 'आग', 'आद', 'आपक', 'इत', 'इतय', 'इनक', 'इनस', 'इसक', 'इसम', 'इसल', 'उनक', 'उनस', 'उसक', 'एव', 'ऐस', 'कभ', 'करत', 'करन', 'कह', 'कहत', 'गय', 'जबक', 'जर', 'जह', 'झक', 'तथ', 'तन', 'तर', 'दब', 'दर', 'दव', 'धर', 'नक', 'नस', 'नह', 'पड', 'पहल', 'बड', 'बन', 'बह', 'यत', 'यद', 'यम', 'रख', 'रत', 'रव', 'रह', 'रहत', 'लक', 'वग', 'वय', 'वर', 'वग़', 'सक', 'सकत', 'सबस', 'सभ', 'सम', 'सर', 'सस', 'हमन', 'हर', 'था', 'दें', 'थी','ले', 'लो', 'थे', 'होगा', 'होगी', 'होंगे', 'ख़ास', 'बहुत', 'बार', 'वाले', 'वाली', 'वाला', 'जब', 'जहाँ', 'जा', 'जिस', 'जिन्हें', 'जिन्हों', 'जिसे', 'जिसका', 'जिसकी','जिसके', 'जिसमें', 'जिधर', 'के', 'का', 'की', 'को', 'कि', 'इस', 'उस', 'उसे', 'उन', 'उन्हें', 'उन्हों', 'उनका', 'उनकी', 'उनके','उनसे', 'अपना', 'अपनी', 'अपने', 'आदि', 'इत्यादि', 'इन्हें', 'इन्हों', 'इनका', 'इनकी', 'इनके', 'इनसे', 'जैसा', 'जैसे','अंदर', 'अत', 'अदि', 'अप', 'अपना', 'अपनि', 'अपनी', 'अपने', 'अभि', 'अभी', 'आदि', 'आप', 'इंहिं', 'इंहें', 'इंहों', 'इतयादि', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकि', 'इसकी', 'इसके', 'इसमें', 'इसि', 'इसी', 'इसे', 'उंहिं', 'उंहें', 'उंहों', 'उन', 'उनका', 'उनकि', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसि', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'एसे', 'ऐसे', 'ओर', 'और', 'कइ', 'कई', 'कर', 'करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफि', 'काफ़ी', 'कि', 'किंहें', 'किंहों', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसि', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोइ', 'कोई', 'कोन', 'कोनसा', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जहां', 'जा', 'जिंहें', 'जिंहों', 'जितना', 'जिधर', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जेसा', 'जेसे', 'जैसा', 'जैसे' , 'तैसा', 'तैसे', 'इसलिए', 'इसके अलावा', 'फिर', 'अगर', 'कि', 'की', 'के बारे में', 'किसी तरह', 'कोई', 'कुछ', 'कुल','जितना', 'तक', 'तो', 'थी', 'थे', 'था', 'ने', 'पर', 'जा', 'जो', 'सबसे', 'संग','से', 'तक', 'साथ', 'ही', 'हुआ', 'हुई', 'हुए', 'होता', 'होती', 'ह']
# remove duplicate stop words
stop_set = set()
for word in stop_words:
  stop_set.add(word)
print("No. of stop words: ", len(stop_set))

No. of stop words:  235


In [5]:
with open('/content/drive/MyDrive/Colab Notebooks/final_stopwords.txt', 'r', encoding='utf8') as file:
    for line in file:
        word = line.strip()  # remove newline character from the end of the line
        stop_set.add(word)  # add the word to the set
print("No. of stop words: ", len(stop_set))

No. of stop words:  422


In [6]:
def remove_stopwords_hindi(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in stop_set]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def remove_stopwords_english(text):
    tokens = word_tokenize(text)
    # print(tokens)
    filtered_tokens = [word for word in tokens if word not in set(stopwords.words('english'))]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [7]:
# removing punctuations
def remove_punctuations(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

In [8]:
# # tokenize and check unique words
def tokenize_unique_save(col):
  unique = set()
  for cell in col:
    tokens = word_tokenize(cell)
    for token in tokens:
      unique.add(token)
  return unique


In [9]:
!pip install indic_transliteration emot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting indic_transliteration
  Downloading indic_transliteration-2.3.44-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting roman
  Downloading roman-4.0-py3-none-any.whl (7.8 kB)
Collecting backports.functools-lru-cache
  Downloading backports.functools_lru_cache-1.6.4-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: emot, roman, backports.functools-lru-cache, indic_transliteration
Successfully installed backports.functools-lru-cache-1.6.4 emot-3.1 indic_transliteration-2.3.44 roman-4.0


In [10]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Define preprocessing functions
def preprocess_hindi_text(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

def transliterate_hindi(text):
    # Transliterate Hindi text to English
    english_text = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
    return english_text.lower()

import unicodedata

def extract_emojis(text):
    # Extract emojis from text
    emojis = ''.join(c for c in text if c in ''.join(chr(i) for i in range(0x1F300, 0x1F6FF+1)))
    return emojis

def extract_raw_english(text):
    # Remove unnecessary symbols
    text = re.sub(r'[^a-z^A-Z]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    return text

In [11]:
from emot.emo_unicode import UNICODE_EMOJI
 # Function for converting emojis into word
def convert_emojis(text):
    for e in UNICODE_EMOJI:
        text = text.replace(e, (UNICODE_EMOJI[e]+' ').replace(':',''))
    return text

convert_emojis('😍😎👸👈🎠👫👸👈👗💃👀👩😋😍👈👸👰 पगली बहोत शोकीन तुजे डोली बिठाके लेके तेरे कपड़...')

'smiling_face_with_heart-eyes smiling_face_with_sunglasses princess backhand_index_pointing_left carousel_horse woman_and_man_holding_hands princess backhand_index_pointing_left dress woman_dancing eyes woman face_savoring_food smiling_face_with_heart-eyes backhand_index_pointing_left princess person_with_veil  पगली बहोत शोकीन तुजे डोली बिठाके लेके तेरे कपड़...'

In [12]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/abusive_text.csv')
df.head()

Unnamed: 0,label,text
0,0,भीड़ में बहुत हब्सी मिलेंगे
1,0,साले बेवकूफ अपनी मां मक्खियां तो हटा दें
2,0,बुर देदो तो मुह में लंड ले लो तो
3,0,कुत्ता वहा है चिल्ला तू क्यों रहा है
4,1,चाय नहीं पीता हूं मैं इसी को छोड़ दिया ok


In [13]:
############################################################################# START PRE_PROCESSING ###################################################################################

# remove punctuation from text
df['Clean Text'] = df['text'].apply(remove_punctuations)

# convert emoji to text
df['Clean Text'] = df['Clean Text'].apply(convert_emojis)

# remove digits from text
df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
df['Clean Text'] = df['Clean Text'].apply(remove_stopwords_hindi)

df['final_text'] = df['Clean Text']

  df['Clean Text'] = df['Clean Text'].str.replace('\d+',' ')


In [14]:
traindf, valdf = train_test_split(df, train_size=0.8)

In [15]:
import torch
import torch.nn as nn
from torchtext import data
from torchtext import datasets
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable 
from tqdm import tqdm

from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [17]:
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hindi_test.csv')

In [18]:
# preprocess data
# remove punctuation from text
test_df['Clean Text'] = test_df['text'].apply(remove_punctuations)

# convert emoji to text
test_df['Clean Text'] = test_df['Clean Text'].apply(convert_emojis)

# remove digits from text
test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')

# remove stop words from hindi
test_df['Clean Text'] = test_df['Clean Text'].apply(remove_stopwords_hindi)

test_df['final_text'] = test_df['Clean Text']

test_df.head(10)

  test_df['Clean Text'] = test_df['Clean Text'].str.replace('\d+',' ')


Unnamed: 0,label,text,Clean Text,final_text
0,0,मैं ये नही सोच रहा की इसे निकले कैसे मैं ये सो...,नही सोच निकले सोच फंसा कैसेface_with_tears_of_...,नही सोच निकले सोच फंसा कैसेface_with_tears_of_...
1,1,और दिवाली में भी पूरा देश पड़ाका नहीं फोडात,दिवाली देश पड़ाका फोडात,दिवाली देश पड़ाका फोडात
2,1,कुत्ता बिल्ली पाल लेना मगर गलत फहमी कभी नहीं।,कुत्ता बिल्ली पाल लेना गलत फहमी नहीं।,कुत्ता बिल्ली पाल लेना गलत फहमी नहीं।
3,0,तेरी गांड में प्याज काट देगा गुज्जर भोसड़ी के,तेरी गांड प्याज काट देगा गुज्जर भोसड़ी,तेरी गांड प्याज काट देगा गुज्जर भोसड़ी
4,1,बंगाली साड़ी ऐसे नहीं पहना जाता है दीदी,बंगाली साड़ी पहना दीदी,बंगाली साड़ी पहना दीदी
5,1,ऐ इंडिया है यह आदमी दो बार जीता है एक बार मरत...,ऐ इंडिया आदमी जीता मरता,ऐ इंडिया आदमी जीता मरता
6,1,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ ...,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ सौ...,अक्कड़ बक्कड़ बंबे बो डीजल नब्बे पेट्रोल सौ सौ...
7,1,एक तीर एक कमान आदिवासी एक समान एक तीर एक कमान ...,तीर कमान आदिवासी तीर कमान जय श्रीराम जय श्रीरा...,तीर कमान आदिवासी तीर कमान जय श्रीराम जय श्रीरा...
8,1,आपका बहुत बड़ा फैन हूं असद ओवैसी साहब मैं आपका...,फैन असद ओवैसी साहब फैन मुजम्मिल थाली number,फैन असद ओवैसी साहब फैन मुजम्मिल थाली number
9,0,तुम सब चूतिया हो रोटी राम,सब चूतिया रोटी राम,सब चूतिया रोटी राम


In [19]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [20]:
import transformers
from transformers import BertModel
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [21]:
X = list(df['final_text'].values)
y = list(df['label'])

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [23]:
X_train[0:5]

['जल्दी उठ जाया करो जी',
 'रस गुला फट रे फट जलेबी लिपट रे',
 'रात पलग तोडेगे',
 'दिल हू',
 'गांडु हिंदुराष्ट्र आहे बर गांडुवानी नको वागु']

In [24]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

In [25]:
# mBert
def test_mBert(testdf):
  batch_size = 32
  X = list(testdf['final_text'].values)
  y = list(testdf['label'])

  # Load the saved model
  model_path = '/content/drive/MyDrive/Colab Notebooks/models/mBert_model.pth'
  # Instantiate the model
  model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

  # Load the saved weights
  state_dict = torch.load(model_path)
  # print(state_dict)

  # Load the state_dict into the model
  model.load_state_dict(state_dict.state_dict())
  model.to(device)
  model.eval()

  # Load the tokenizer
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

  val_encodings = tokenizer(X, padding=True, truncation=True, max_length=200)
  val_labels = torch.tensor(y)

  val_inputs = torch.tensor(val_encodings['input_ids'])
  val_masks = torch.tensor(val_encodings['attention_mask'])

  val_data = TensorDataset(val_inputs, val_masks, val_labels)
  val_dataloader = DataLoader(val_data, batch_size=batch_size)

  # Validation loop
  val_loss = 0
  val_acc = 0
  total_f1 = 0
  nb_eval_steps = 0
  running_loss_val = 0
  correct_val = 0
  total_val = 0
  total_step_val = len(val_dataloader)

  model.eval()
  with torch.no_grad():
    for batch in tqdm(val_dataloader):
        inputs, masks, labels = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs[0]
        val_loss += loss.item()
        predictions = torch.max(outputs[1], dim=1)
        val_acc += torch.sum(predictions[1] == labels).item()
        total_f1 += f1_score(labels.cpu(), predictions[1].cpu(), average='macro')
        nb_eval_steps += 1 
    val_loss /= len(val_dataloader)
    val_acc = float(val_acc) / float(len(val_data))
    avg_f1 = total_f1 / nb_eval_steps

  # Print epoch results
  print(f"\n\nVal loss: {val_loss:.4f} - Val accuracy: {val_acc:.4f}")
  print(f"Macro-f1: {100*avg_f1:.4f}")

test_mBert(test_df)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

100%|██████████| 211/211 [01:10<00:00,  2.99it/s]



Val loss: 0.4246 - Val accuracy: 0.8016
Macro-f1: 79.6498





In [26]:
# mBert
def test_MURiL(testdf):
  batch_size = 32

  X = list(testdf['final_text'].values)
  y = list(testdf['label'])

  # Load the saved model
  model_path = '/content/drive/MyDrive/Colab Notebooks/models/muril_model.pth'
  # Instantiate the model
  model = AutoModelForSequenceClassification.from_pretrained('google/muril-base-cased')

  # Load the saved weights
  state_dict = torch.load(model_path)
  # print(state_dict)

  # Load the state_dict into the model
  model.load_state_dict(state_dict.state_dict())
  model.to(device)
  model.eval()

  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained('google/muril-base-cased')

  val_encodings = tokenizer(X, padding=True, truncation=True, max_length=200)
  val_labels = torch.tensor(y)

  val_inputs = torch.tensor(val_encodings['input_ids'])
  val_masks = torch.tensor(val_encodings['attention_mask'])

  val_data = TensorDataset(val_inputs, val_masks, val_labels)
  val_dataloader = DataLoader(val_data, batch_size=batch_size)

  # Validation loop
  val_loss = 0
  val_acc = 0
  total_f1 = 0
  nb_eval_steps = 0
  running_loss_val = 0
  correct_val = 0
  total_val = 0
  total_step_val = len(val_dataloader)

  model.eval()
  with torch.no_grad():
    for batch in tqdm(val_dataloader):
        inputs, masks, labels = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs[0]
        val_loss += loss.item()
        predictions = torch.max(outputs[1], dim=1)
        val_acc += torch.sum(predictions[1] == labels).item()
        total_f1 += f1_score(labels.cpu(), predictions[1].cpu(), average='macro')
        nb_eval_steps += 1 
    val_loss /= len(val_dataloader)
    val_acc = float(val_acc) / float(len(val_data))
    avg_f1 = total_f1 / nb_eval_steps

  # Print epoch results
  print(f"\n\nVal loss: {val_loss:.4f} - Val accuracy: {val_acc:.4f}")
  print(f"Macro-f1: {100*avg_f1:.4f}")

test_MURiL(test_df)

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not in

Downloading (…)okenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

100%|██████████| 211/211 [01:13<00:00,  2.89it/s]



Val loss: 0.5819 - Val accuracy: 0.8365
Macro-f1: 82.9832





In [27]:
def print_intersection_points(train, val, test):
  train_col = set(train)
  test_col = set(test)
  # Count the number of common rows between the two dataframes
  common_rows = len(train_col.intersection(test_col))
  print("Number of common rows between train and test:", common_rows)

  # Extract the column of interest from validation and test dataframes
  valid_col = set(val)

  # Count the number of common rows between the two dataframes
  common_rows = len(valid_col.intersection(test_col))
  print("Number of common rows between validation and test:", common_rows)

print_intersection_points(X_train, X_val, list(test_df['final_text'].values))

Number of common rows between train and test: 63
Number of common rows between validation and test: 28
