In [None]:
import pandas as pd
from tensorflow import keras as tf
import tensorflow as T
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Augmenting data

In [None]:
def augment_neutral_class(df, n, max_instances=500000, max_spaces=374):
    """Create synthetic neutral texts by joining randomly sampled neutral rows.

    For every group size from 2 to 10, ``n`` groups of neutral texts
    (rows whose ``target`` equals 1) are sampled with replacement and joined
    with ``', '``. A candidate is kept only when it contains fewer than
    ``max_spaces`` space characters.

    Args:
        df: DataFrame with a ``text`` and a ``target`` column.
        n: Number of samples drawn per group size.
        max_instances: Upper bound on the number of texts returned
            (``None`` disables the cap).
        max_spaces: Exclusive upper bound on the number of spaces in a
            generated text.

    Returns:
        A list of unique generated strings in generation order; empty when
        ``df`` has no usable neutral text.
    """
    # Missing texts cannot be joined, so drop them before sampling.
    neutral_texts = df.loc[df['target'] == 1, 'text'].dropna().astype(str)
    if neutral_texts.empty:
        return []

    # A dict is used as an insertion-ordered set: de-duplication no longer
    # depends on hash ordering, so the result is reproducible under a fixed
    # NumPy seed.
    unique_instances = {}
    for group_size in range(2, 11):
        for _ in range(n):
            if max_instances is not None and len(unique_instances) >= max_instances:
                return list(unique_instances)
            sampled = neutral_texts.sample(n=group_size, replace=True)
            candidate = ', '.join(sampled.values)
            if candidate.count(' ') < max_spaces:
                unique_instances[candidate] = None
    return list(unique_instances)

# Transformers

In [None]:
# Load the labelled training set from the mounted Drive.
dataset = pd.read_csv('/content/drive/MyDrive/IFT6390/Dataset/dataset_final_fr.csv')
#dataset = pd.read_csv('/content/drive/MyDrive/Kaggle-2/dataset.csv')

# Optional neutral-class augmentation (currently disabled):
# augmented_neutral_data = augment_neutral_class(dataset, 100000)
# dataset = dataset.append(pd.DataFrame({'text': augmented_neutral_data, 'target': 1}), ignore_index=True)


In [None]:
vocab_size = 25000  # passed to the Tokenizer as num_words and to the token embedding as its input size
maxlen = 374  # length every sequence is padded/truncated to by pad_sequences

# Texts are cast to str first, presumably to guard against non-string cells such as NaN.
X = dataset['text'].astype(str).values.tolist()
tokenizer = tf.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
# NOTE: X is rebound here from the list of raw strings to the padded index sequences.
X = tf.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# One-hot encode the integer labels into 3 classes.
Y = tf.utils.to_categorical(dataset['target'].values.tolist(), 3)

In [None]:
initializer = tf.initializers.HeNormal()

In [None]:
class TransformerBlock(tf.layers.Layer):
    """Self-attention block: attention and a two-layer feed-forward net,
    each followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: Width of the input/output vectors; also used as the
            attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after attention and after the
            feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = tf.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            kernel_initializer=initializer,
        )

        # As in "Attention Is All You Need", the only activation sits
        # between the two dense layers.
        expand = tf.layers.Dense(ff_dim, activation="relu", kernel_initializer=initializer)
        project = tf.layers.Dense(embed_dim, kernel_initializer=initializer)
        self.ffn = tf.Sequential([expand, project])

        self.layernorm1 = tf.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.layers.Dropout(rate)
        self.dropout2 = tf.layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to ``inputs``; ``training`` toggles both dropout layers."""
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

class TokenAndPositionEmbedding(tf.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table holds.
        vocab_size: Number of rows in the token table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position along the last axis."""
        seq_len = T.shape(x)[-1]
        position_ids = T.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def transformer():
  """Build the (uncompiled) text classifier used in this section.

  Despite the name, the transformer block is currently commented out. The
  active graph embeds the tokens, runs a stacked-LSTM branch and a Conv1D
  branch in parallel, multiplies the two 128-wide results element-wise,
  reshapes a dense projection into a 30x30x3 "image" for a small Conv2D
  stack, and ends in a 3-way softmax.

  Reads the module-level ``maxlen``, ``vocab_size`` and ``initializer``.

  Returns:
    A Keras ``Model`` taking ``maxlen`` token ids per example and producing
    3 class probabilities.
  """
  embed_dim = 32
  num_heads = 2   # only used by the disabled transformer block below
  ff_dim = 512    # only used by the disabled transformer block below

  embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
  # transformer_block_1 = TransformerBlock(embed_dim, num_heads, ff_dim)

  # `(maxlen)` is just the int `maxlen`; the shape must be a real 1-tuple.
  inputs = tf.layers.Input(shape=(maxlen,))
  x_ip = embedding_layer(inputs)
  # x = transformer_block_1(x_ip)
  # tx = tf.layers.Dropout(0.1)(x)
  # tx = tf.layers.GlobalMaxPool1D()(tx)
  # tx = tf.layers.Dense(512, 'relu', kernel_initializer=initializer)(tx)
  # tx = tf.layers.Dense(128, 'relu', kernel_initializer=initializer)(tx)

  # Recurrent branch: two stacked LSTMs, the second returning only its final 128-wide state.
  lx = tf.layers.LSTM(128, return_sequences=True, kernel_initializer=initializer)(x_ip)
  # lx = tf.layers.Flatten()(lx)
  lx = tf.layers.LSTM(128, kernel_initializer=initializer)(lx)

  # Convolutional branch: two Conv1D layers, max-pooled over time to 128 features.
  cx = tf.layers.Conv1D(filters=128, kernel_size=4, activation="relu", kernel_initializer=initializer)(x_ip)
  cx = tf.layers.Conv1D(filters=128, kernel_size=4, activation="relu", kernel_initializer=initializer)(cx)
  cx = tf.layers.GlobalMaxPooling1D()(cx)

  # Fuse both 128-wide branches element-wise.
  x = tf.layers.Multiply()([lx, cx])
  x = tf.layers.Dropout(0.1)(x)
  x = tf.layers.Dense(1024, 'relu', kernel_initializer=initializer)(x)
  x = tf.layers.Dense(2700, 'relu', kernel_initializer=initializer)(x)  # 2700 = 30 * 30 * 3

  x_2d = tf.layers.Reshape((30,30,3))(x)
  cx_2d = tf.layers.Conv2D(16,3, activation='relu', kernel_initializer=initializer)(x_2d)
  cx_2d = tf.layers.Conv2D(8,3, activation='relu', kernel_initializer=initializer)(cx_2d)
  cx_2d = tf.layers.Conv2D(3,5, activation='relu', kernel_initializer=initializer)(cx_2d)
  # NOTE(review): Conv1D is applied to the 4-D output of the Conv2D stack - confirm this is intended.
  x = tf.layers.Conv1D(filters=256, kernel_size=4, activation="relu", kernel_initializer=initializer)(cx_2d)
  x = tf.layers.Flatten()(x)
  x = tf.layers.Dense(128, kernel_initializer=initializer, activation="relu")(x)

  outputs = tf.layers.Dense(3, kernel_initializer=initializer, activation="softmax")(x)

  model = tf.Model(inputs=inputs, outputs=outputs)
  return model

# Build the network and compile it for one-hot (categorical) targets with the Adam optimiser.
model = transformer()
model.compile('adam', loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
print(X.shape, Y.shape)

(1040323, 374) (1040323, 3)


In [None]:
# Train for a single epoch, holding out 25% of the rows for validation.
history = model.fit(
    X,
    Y, 
    batch_size=512, 
    epochs=1, 
    validation_split=0.25
)
# A validation loss below the training loss can look counter-intuitive; it may be caused by the dropout
# layers, which make the model behave differently during training and inference.

  77/1524 [>.............................] - ETA: 18:39:52 - loss: 0.7247 - accuracy: 0.5562

In [None]:
model.save('/content/drive/MyDrive/Kaggle-2/Transformer-30-11-2.0')

In [None]:
# dataset = pd.read_csv('/content/drive/MyDrive/IFT6390/Kaggle-2/test.csv')
# NOTE: this rebinds `dataset` (until now the training frame) to the test CSV.
dataset = pd.read_csv('/content/drive/MyDrive/Kaggle-2/test.csv')
print(dataset.shape)
# Raw test texts as a plain list of str, the form tokenizer.texts_to_sequences is given below.
test = dataset['text'].astype(str).values.tolist()
print(len(test))

(560175, 1)
560175


In [None]:
# Encode the raw test texts with the tokenizer fitted on the training data and pad them to `maxlen`.
sequences = tokenizer.texts_to_sequences(test)
test = tf.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
predictions = model.predict(test)
# One predicted class index per test row.
preds = np.argmax(predictions, axis=1)
# Stream the submission row by row instead of growing one large string and handing that string to
# writelines (which writes it character by character); this also avoids shadowing the builtin `id`.
# with open('/content/drive/MyDrive/IFT6390/Kaggle-2/predictions_transformer_modify.csv', 'w') as f:
with open('/content/drive/MyDrive/Kaggle-2/predictions_transformer_4.csv', 'w') as f:
  f.write('id,target\n')
  f.writelines('{},{}\n'.format(row_id, label) for row_id, label in enumerate(preds))



# LSTM+CNN



In [None]:
# dataset = pd.read_csv('/content/drive/MyDrive/IFT6390/Kaggle-2/dataset.csv')
dataset = pd.read_csv('/content/drive/MyDrive/Kaggle-2/dataset.csv')

# Add synthetic neutral (target == 1) rows built from existing neutral texts.
augmented_neutral_data = augment_neutral_class(dataset, 10000)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dataset = pd.concat(
    [dataset, pd.DataFrame({'text': augmented_neutral_data, 'target': 1})],
    ignore_index=True,
)

vocab_size = 20000
maxlen = 374

# X = dataset['text'].values.tolist()
# tokenizer = tf.preprocessing.text.Tokenizer(num_words=vocab_size)
# tokenizer.fit_on_texts(X)
# sequences = tokenizer.texts_to_sequences(X)
# X = tf.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Y = tf.utils.to_categorical(dataset['target'].values.tolist(), 3)

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import models
from keras import layers

In [None]:
_ENGLISH_STOPWORDS = None  # lazily-built cache of NLTK's English stop words


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace and re-joined with single spaces.
    Matching is case-sensitive, exactly as the words appear in ``text``.

    Args:
        text: The string to filter.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used; it is loaded once and cached
            as a frozenset, so applying this function row by row no longer
            reloads and linearly scans the list on every call.

    Returns:
        The filtered text.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return ' '.join(word for word in text.split() if word not in stop_words)

def remove_mentions(text):
    """Drop every whitespace-separated token that starts with '@' and re-join the rest with single spaces."""
    kept = []
    for token in text.split():
        if token.startswith('@'):
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# View size of train new data
print(len(dataset))
print(dataset.shape)

1125596
(1125596, 2)


In [None]:
train_new=dataset

In [None]:
# Strip English stop words, then @mentions, from the text column.
# NOTE: `train_new` was bound with a plain assignment from `dataset`, so this also rewrites dataset['text'].
train_new['text'] = train_new['text'].apply(remove_stopwords).apply(remove_mentions)

In [None]:
# Hold out 10% of the rows, then map each text to a sequence of integer word indices
# with a Keras Tokenizer fitted on the training split only (this is not word2vec).
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(train_new.text, train_new.target, test_size=0.1, random_state=37)
tk = Tokenizer(num_words=20000, lower=True, split=' ')
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=374)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=374)

In [None]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [None]:
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [None]:
emb_model = models.Sequential()
# The embedding table must cover every index the tokenizer can emit. `tk` is created with
# num_words=20000, so the previous hard-coded input_dim of 10000 left indices 10000..19999
# outside the table.
emb_model.add(layers.Embedding(tk.num_words, 8, input_length=374))
# emb_model.add(layers.Flatten())
emb_model.add(layers.LSTM(512, dropout=0.2))
emb_model.add(layers.Dense(3, activation='softmax'))
emb_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
emb_model.fit(X_train_emb, y_train_emb, epochs=10, batch_size=512, validation_data=(X_valid_emb, y_valid_emb))

Epoch 1/10

In [None]:
y_pred = emb_model.predict(X_valid_emb)



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

## CNN Model


In [None]:
test = pd.read_csv('/content/drive/MyDrive/IFT6390/Dataset/test_final_fr.csv')

In [None]:
test.head()

In [None]:
# Longest text, in whitespace-separated tokens, across both the training and the test texts.
max_len = max(
    max((len(text.split()) for text in dataset['text']), default=0),
    max((len(text.split()) for text in test['text']), default=0),
)

print('Max length of texts :', max_len)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [None]:
max_fatures = 300000 # passed to the Tokenizer as num_words and to the Embedding layer as its input size
tokenizer = Tokenizer(num_words=max_fatures, split=' ') # create the Tokenizer instance
tokenizer.fit_on_texts(dataset['text'].values)
train_converted = tokenizer.texts_to_sequences(dataset['text'].values)
# test = tokenizer.texts_to_sequences(test['text'].values)
train_converted = pad_sequences(train_converted, maxlen=max_len) # pad the index sequences of the train data to max_len
#test = pad_sequences(test, maxlen=max_len) # pad the index sequences of the test data to max_len

In [None]:
# Encode the real test texts with the tokenizer fitted on the training data and pad them to max_len.
# NOTE: this rebinds `test` from the loaded DataFrame to the padded sequences, so re-running
# this cell would index the padded result with ['text'].
test = tokenizer.texts_to_sequences(test['text'].values)
test = pad_sequences(test, maxlen=max_len)

In [None]:
target_converted = pd.get_dummies(dataset['target']).values

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 10% of the encoded rows (features and one-hot labels are split together).
X_train, X_test, Y_train, Y_test = train_test_split(train_converted, target_converted, test_size = 0.1, random_state = 42)

# Carve the last `validation_size` rows of the held-out split off as validation data for training;
# the remaining rows stay as the test split.
validation_size = 50000
# validation_size = 500
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

print('The shape of train data :', X_train.shape)
print('The shape of labels of train data :', Y_train.shape)
print('The shape of test data :', X_test.shape)
print('The shape of test label data :', Y_test.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SpatialDropout1D, Embedding, LSTM, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dropout
import matplotlib.pyplot as plt

In [None]:
# Parameters
NUM_FILTERS = 256 # Number of convolution filters
NUM_WORDS = 4 # Convolution window: number of consecutive words each filter covers
embed_dim = 512 # The size of the vector space where words will be embedded
batch_size = 512
EPOCHS = 2

# Create the CNN model: embedding -> spatial dropout -> Conv1D -> max over time -> 3-way softmax
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length = train_converted.shape[1]))
model.add(SpatialDropout1D(0.5))
model.add(Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation="softmax"))
model.compile(optimizer="adam", loss="categorical_crossentropy",metrics=["acc"])
# summary() prints the table itself and returns None, so wrapping it in print() added a stray "None".
model.summary()

history = model.fit(X_train, Y_train, batch_size=batch_size,
                    epochs=EPOCHS, validation_data=(X_validate, Y_validate))

In [None]:
# Per-epoch accuracy on the training data and on the validation data passed to fit().
train_acc = history.history['acc']
test_acc = history.history['val_acc']  # validation accuracy, despite the variable name
x = np.arange(len(train_acc))
plt.plot(x, train_acc, label = 'train accuracy')
# 'val_acc' is measured on the validation data, so label the curve accordingly.
plt.plot(x, test_acc, label = 'validation accuracy')
plt.title('Train and validation accuracy')
plt.xlabel('Number of epochs')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
score, acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("The loss of this model: %.2f" % (score))
print("The accuracy of this model: %.2f" % (acc))

In [None]:
# Predict class probabilities for the real test set
predictions = model.predict(test)
# Show the first 5 predictions as samples
print('Prediction samples', predictions[:5])
# Show the shape of the predictions; the number of rows should equal the number of test rows
print('The shape of predictions:', predictions.shape)



# Let's turn the prediction from [0, 1] and [1, 0] into 0(negative) and 4(positive)
# prediction_final = []
# for each_pediction in prediction_binary:
#   if each_pediction[0] == 1:
#     prediction_final.append(0)
#   else:
#     prediction_final.append(4)

In [None]:
predictions.dtype

In [None]:
# Collapse each row of class scores to the index of its largest entry.
sentiment = list(np.argmax(predictions, axis=1))
# Show the first 5 sentiment values
print('Sentiment samples:', sentiment[:5])

In [None]:
submission = pd.DataFrame({'id': range(len(sentiment)), 'target': sentiment})

In [None]:
submission.head()

In [None]:
submission.to_csv('submission_CNN_final.csv', index=False)

## SVM

In [None]:
dataset.shape

(1540323, 2)

In [None]:
# Work on an independent copy of the first 200,000 rows: adding columns to a bare slice
# triggers pandas' SettingWithCopyWarning and may or may not write through to `dataset`.
data = dataset[:200000].copy()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re

In [None]:
_ENGLISH_STOPS = None  # lazily-built cache of NLTK's English stop words


def sentences_to_words(text, stops=None):
    """Normalise a text to lower-case ASCII words with stop words removed.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the
    result is lower-cased and split on whitespace, stop words are dropped and
    the remaining words are joined with single spaces.

    Args:
        text: The raw text. Non-string values (for example NaN from a
            DataFrame) yield an empty string instead of raising.
        stops: Optional collection of lower-case words to drop. When omitted,
            the NLTK English stop-word list is used; it is loaded once and
            cached instead of being rebuilt on every call.

    Returns:
        The cleaned text.
    """
    global _ENGLISH_STOPS
    if not isinstance(text, str):
        return ""
    if stops is None:
        if _ENGLISH_STOPS is None:
            _ENGLISH_STOPS = frozenset(stopwords.words("english"))
        stops = _ENGLISH_STOPS
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return " ".join(w for w in letters_only.lower().split() if w not in stops)

In [None]:
# Make sure the stop-word corpus is available, then store the cleaned form of every text in a new column.
nltk.download('stopwords')
data['clean_text'] = data['text'].apply(sentences_to_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text']=data['text'].apply(lambda x: sentences_to_words(x))


In [None]:
x = data['clean_text']
y = data['target']

print(len(x),len(y))

200000 200000


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

150000 150000
50000 50000


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)


CountVectorizer()

In [None]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)


In [None]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
vect_tunned

CountVectorizer(max_df=0.7, max_features=100, min_df=0.1, ngram_range=(1, 2),
                stop_words='english')

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM on the document-term counts.
# NOTE: this rebinds `model`, the name earlier cells use for the Keras networks.
model = SVC(kernel='linear', random_state = 10)
model.fit(x_train_dtm, y_train)
# predict labels for the held-out split
pred = model.predict(x_test_dtm)

In [None]:
pred

array([2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0,
       0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0,
       0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2,
       0, 2, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0,
       2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0,
       0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2,
       0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2,
       0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0,
       2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2,

In [None]:
data = dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `tf`, which the first cell imports as the alias for tensorflow.keras; cells that
# use tf.layers / tf.preprocessing will not work after this one until that import is re-run.
tf=TfidfVectorizer()
# Fit the vectorizer on the training text and build its TF-IDF matrix.
text_tf= tf.fit_transform(data['text'])

In [None]:
# Reuse the vectorizer fitted on the training text. Calling fit_transform here would refit it on the
# test set, producing columns that no longer line up with the features the classifier is trained on.
test_tf = tf.transform(test['text'])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, data['target'], test_size=0.3, random_state=123)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.8444092906900499


In [None]:
predicted

array([1, 2, 2, ..., 2, 2, 1])

In [None]:
# Reload the raw test set.
test = pd.read_csv('/content/drive/MyDrive/IFT6390/Kaggle-2/test.csv')
#test = pd.read_csv('/content/drive/MyDrive/Kaggle-2/test.csv')
# NOTE(review): this prints the shape of `dataset`, not of the freshly loaded `test` - confirm which was intended.
print(dataset.shape)

(1540323, 2)


In [None]:
predictions = clf.predict(test_tf)

In [None]:

# `predictions` comes from clf.predict(), which already yields one class label per row (1-D),
# so no argmax is needed; np.argmax(predictions, 1) fails on 1-D input.
preds = np.asarray(predictions)
# Write to a file of its own: this cell previously reused the transformer's output path
# (predictions_transformer_4.csv) and would have overwritten that model's submission.
with open('/content/drive/MyDrive/Kaggle-2/predictions_multinomial_nb.csv', 'w') as f:
  f.write('id,target\n')
  f.writelines('{},{}\n'.format(row_id, label) for row_id, label in enumerate(preds))

# SVM

In [None]:
from sklearn import neighbors
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/IFT6390/Dataset/dataset_final_fr.csv')

In [None]:
data = dataset

In [None]:
data.head()

Unnamed: 0,id,text,target
0,0,anyway im geting of for a while,2
1,1,my red apache isn't felin to wel this morning htp,0
2,2,@user you should be its great friday wil be gr...,2
3,3,its pm and i dont wana slep so i debated with ...,2
4,4,why does twiter eat my dm's not hapy,0


In [None]:
# Example of a (deliberately naive) string kernel.
def equal_elements(s1,s2):
    """Count the positions at which two sequences hold equal elements.

    The sequences are compared position by position over their common
    prefix length, so inputs of different lengths are accepted (the previous
    index-based loop raised IndexError whenever ``s2`` was shorter than
    ``s1``).

    Args:
        s1: First sequence (for example a string).
        s2: Second sequence.

    Returns:
        The number of indices ``i`` with ``s1[i] == s2[i]``, as an int.
    """
    return sum(1 for a, b in zip(s1, s2) if a == b)

equal_elements("STRING","KERNEL")

1

In [None]:
# Demonstration: hand raw strings to an SVC whose kernel is the Python callable above.
# Presumably the failure comes from fit() validating X as a numeric array - TODO confirm.
clf = SVC(kernel=equal_elements)
clf.fit(data['text'],data['target']) # this produces an error

In [None]:
data = data[['text','target']]

In [None]:
data.iloc[1,0]

"my red apache isn't felin to wel this morning htp"

In [None]:
# Pairwise equal_elements scores between the first `size` texts, as a (size, size) float matrix.
size = 12
first_texts = [data.iloc[k, 0] for k in range(size)]
not_so_good_string_kernel = np.array(
    [[equal_elements(left, right) for right in first_texts] for left in first_texts],
    dtype=float,
)
not_so_good_string_kernel

In [None]:
def compose_kernel(row_idxs, col_idxs, kernel=None):
    """Select a sub-matrix of a precomputed kernel by row and column indices.

    Args:
        row_idxs: 1-D sequence of row indices (cast to int).
        col_idxs: 1-D sequence of column indices (cast to int).
        kernel: Optional 2-D precomputed kernel matrix. Defaults to the
            module-level ``not_so_good_string_kernel``.

    Returns:
        A new float array of shape ``(len(row_idxs), len(col_idxs))`` whose
        entry ``[i, j]`` is ``kernel[row_idxs[i], col_idxs[j]]``.
    """
    if kernel is None:
        kernel = not_so_good_string_kernel  # Change to custom distance kernel
    # `np.int` was deprecated in NumPy 1.20 and later removed; the builtin int is the replacement.
    rows = np.asarray(row_idxs).astype(int)
    cols = np.asarray(col_idxs).astype(int)
    # np.ix_ builds the open mesh that picks every (row, col) combination in one indexing step.
    return np.asarray(kernel, dtype=float)[np.ix_(rows, cols)]

compose_kernel([5,2,3,1],[5,2,3,1]) # random example

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  row_idxs = np.array(row_idxs).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  col_idxs = np.array(col_idxs).astype(np.int)


array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [None]:
y = data['target'].values
# Split row indices rather than the raw strings, holding out 4 of the first `size` rows.
# A fixed random_state keeps the split reproducible across runs; StratifiedKFold is an alternative.
X_train_idx, X_test_idx, y_train, y_test = train_test_split(np.arange(size), y[:size], test_size=4, random_state=42)
X_train_idx, X_test_idx, y_train, y_test