In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

train.head

<bound method NDFrame.head of          id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...  

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
VOCAB_SIZE = 5000

vectorizer = CountVectorizer(stop_words=stop_words)
words = vectorizer.fit_transform(train['text'])

word_count_df = pd.DataFrame({'word': list(vectorizer.vocabulary_.keys()), 'count': np.asarray(words.sum(axis=0)).ravel()})
word_count_df = word_count_df.sort_values('count', ascending=False).head(VOCAB_SIZE)

def transform(df):
    X_transformed = vectorizer.transform(df['text'])
    X_transformed = X_transformed[:, word_count_df.index]
    X_transformed = X_transformed.toarray()
    return X_transformed

X_train = transform(train)
X_test = transform(test)
y_train = np.array(train['target'])

print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]


In [3]:
import torch

X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train)
X_test = torch.Tensor(X_test)

print(X_train.size())

torch.Size([7613, 5000])


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(VOCAB_SIZE, 100)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(100, 50)
        self.linear3 = nn.Linear(50, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.linear1(x)
        x = self.sigmoid(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.sigmoid(x)
        return x

net = Net()
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)

for epoch in range(1000):
    optimizer.zero_grad()
    outputs = net(torch.Tensor(X_train))
    loss = criterion(outputs, torch.Tensor(y_train).view(-1, 1))
    loss.backward()
    optimizer.step()

In [5]:
def predict(X_test):
    with torch.no_grad():
        outputs = net(torch.Tensor(X_test))
        print(outputs)
        predicted = (outputs > 0.5).int()
    
    return predicted

submission = pd.read_csv('./sample_submission.csv')
submission['target'] = predict(X_test)
print(submission)
submission.to_csv('submission.csv', index=False)

tensor([[0.4386],
        [0.4386],
        [0.4390],
        ...,
        [0.4390],
        [0.4389],
        [0.4388]])
         id  target
0         0       0
1         2       0
2         3       0
3         9       0
4        11       0
...     ...     ...
3258  10861       0
3259  10865       0
3260  10868       0
3261  10874       0
3262  10875       0

[3263 rows x 2 columns]
