In [1]:
import pandas as pd

In [4]:
train_frame = pd.read_csv('../train.csv')
test_frame = pd.read_csv('../test.csv')

In [5]:
train_frame = train_frame.sample(frac=0.1, random_state=123)
train_frame.shape

(130612, 3)

In [9]:
train_frame['target'].value_counts(normalize=True)

target
0    0.939117
1    0.060883
Name: proportion, dtype: float64

## Prepare data for training
1. Convert to TF-IDF vectors
2. Convert vectors to PyTorch tensors
3. Create PyTorch dataloaders

### Convert to TF-IDF vectors 

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

w = tf * log(N/df) <br>
w = frequency of x in doc y <br>
N = total doc <br>
df = total doc where x is present


In [34]:
stemmer = SnowballStemmer(language='english')
english_stopword = stopwords.words('english')

In [31]:
def tokenize(text):
    return [stemmer.stem(word) for word in word_tokenize(text)]

['go', ',', 'gone', ',', 'goe']

In [35]:
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words=english_stopword,
    max_features=1000,

)

In [36]:
%%time
vectorizer.fit(train_frame.question_text)



CPU times: user 13.8 s, sys: 165 ms, total: 14 s
Wall time: 14.1 s


In [38]:
vectorizer.get_feature_names_out()[:100]

array(['!', '$', '%', '&', "'", "''", "'m", "'s", '(', ')', ',', '-', '.',
       '1', '10', '100', '12', '12th', '15', '2', '20', '2017', '2018',
       '3', '4', '5', '6', '7', '8', ':', '?', '[', ']', '``', 'abl',
       'abus', 'accept', 'accomplish', 'accord', 'account', 'achiev',
       'act', 'action', 'activ', 'actor', 'actual', 'ad', 'add',
       'address', 'admiss', 'adult', 'advanc', 'advantag', 'advic',
       'affect', 'africa', 'african', 'age', 'ago', 'air', 'alcohol',
       'allow', 'alon', 'along', 'alreadi', 'also', 'altern', 'alway',
       'amazon', 'america', 'american', 'among', 'amount', 'analysi',
       'android', 'ani', 'anim', 'anoth', 'answer', 'anxieti', 'anyon',
       'anyth', 'apart', 'app', 'appear', 'appl', 'appli', 'applic',
       'approach', 'arab', 'area', 'armi', 'around', 'art', 'asian',
       'ask', 'atheist', 'attack', 'attend', 'attract'], dtype=object)

In [39]:
%%time
inputs = vectorizer.transform(train_frame.question_text)

CPU times: user 13.9 s, sys: 123 ms, total: 14 s
Wall time: 14 s


In [40]:
%%time
test_input = vectorizer.transform(test_frame.question_text)

CPU times: user 39.7 s, sys: 264 ms, total: 40 s
Wall time: 40 s


In [42]:
print(inputs.shape)
print(test_input.shape)

(130612, 1000)
(375806, 1000)


#### Spliting data

In [44]:
from sklearn.model_selection import train_test_split
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs,
    train_frame.target,
    random_state=124,
    test_size=0.3
)

In [45]:
print(train_inputs.shape)
print(train_targets.shape)

(91428, 1000)
(91428,)


 ### Convert to pytorch tensors

In [46]:
import torch


In [58]:
train_inputs_tensors = torch.tensor(train_inputs.toarray()).float()
val_inputs_tensors = torch.tensor(val_inputs.toarray()).float()
train_inputs_tensors.shape


torch.Size([91428, 1000])

In [60]:
train_target_tensors = torch.tensor(train_targets.values)
val_target_tensors = torch.tensor(val_targets.values)
print(train_target_tensors.shape)

torch.Size([91428])


In [68]:
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_inputs_tensors,train_target_tensors)
val_dataset = TensorDataset(val_inputs_tensors,val_target_tensors)

In [69]:
train_dataloader = DataLoader(train_dataset,batch_size=128,shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=128)

#### NN network 

In [None]:
class classifier(torch.nn):
    def __init__(self):
        self.layer = True