In [1]:
import pandas as pd
df=pd.read_csv('data.csv')
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [2]:
df_positive = df[df['label'] == 'good']
  
print(df_positive)

                                                      url label
42767                                          01453.com/  good
42768               015fb31.netsolhost.com/bosstweed.html  good
42769             02bee66.netsolhost.com/lincolnhomepage/  good
42770   02ec0a3.netsolhost.com/getperson.php?personID=...  good
42771                                         032255.com/  good
...                                                   ...   ...
387583  youthleaguesusa.com/potomacsoccer/2011/Tournam...  good
387584                                     zip-codes.com/  good
387585                    owens.edu/news-releases/?p=2052  good
387586      1.safesecureweb.com/egale/index.asp?item=1173  good
387587               yurika.otakuthon.com/reg/main.pl/en/  good

[344821 rows x 2 columns]


In [3]:
df_negative = df[df['label'] == 'bad']
  
print(df_negative)

                           url label
0       diaryofagameaddict.com   bad
1             espdesign.com.au   bad
2           iamagameaddict.com   bad
3                kalantzis.net   bad
4        slightlyoffcenter.net   bad
...                        ...   ...
420459         23.227.196.215/   bad
420460      apple-checker.org/   bad
420461       apple-iclods.org/   bad
420462      apple-uptoday.org/   bad
420463       apple-search.info   bad

[75643 rows x 2 columns]


In [4]:
df_positive.to_csv('positive.csv')
df_negative.to_csv('negative.csv')

In [3]:
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("positive.csv", "r", encoding='latin-1').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("negative.csv", "r", encoding='latin-1').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary_inv = list(sorted(vocabulary_inv))
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data():
    """
    Loads and preprocessed data for the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]

In [4]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()


Loading data
[[    43 752806     43 ... 489858 489858 489858]
 [403329     43   1371 ... 489858 489858 489858]
 [403331     43   1411 ... 489858 489858 489858]
 ...
 [402235     43 503104 ... 489858 489858 489858]
 [402236     43 503104 ... 489858 489858 489858]
 [402237     43 503104 ... 489858 489858 489858]] [[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [7]:
print(len(vocabulary))

776287


In [25]:

try:
    geeky_file = open('vocabulary.txt', 'wt')
    geeky_file.write(str(vocabulary))
    geeky_file.close()

except:
    print("Unable to write to file")

In [28]:
with open('vocabulary_inv.txt', 'w+') as f:
    for items in vocabulary_inv:
        f.write('%s\n' %items)
    print("File written successfully")

f.close()

File written successfully


In [8]:
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)


In [7]:
print(X_train)
print(y_train)

[[224452     43 619046 ... 489858 489858 489858]
 [ 71985     43 736306 ... 489858 489858 489858]
 [302041     43 718341 ... 489858 489858 489858]
 ...
 [103000     43 497321 ... 489858 489858 489858]
 [121144     43 507341 ... 489858 489858 489858]
 [ 90948     43 766226 ... 489858 489858 489858]]
[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [0 1]
 [0 1]]


In [9]:
sequence_length = x.shape[1] 
vocabulary_size = len(vocabulary_inv) 
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 1
batch_size = 30

In [16]:
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

# this creates a model 
model = Model(inputs=inputs, outputs=output)

#checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
print("Traning Model...")
history=model.fit(X_train[:16818], y_train[:16818], batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test[:4204], y_test[:4204]))  # starts training


Creating Model...
Traning Model...


In [23]:
model.save('word-cnn')

INFO:tensorflow:Assets written to: word-cnn\assets


In [None]:
#reconstructed_model = keras.models.load_model("my_model")

In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

X_sample = 'pos-kupang.com/'
X_sample=clean_str(X_sample)
tk = Tokenizer(num_words=True, oov_token='UNK')
tk.fit_on_texts(X_sample)
X_sample = tk.texts_to_sequences(X_sample)
X_sample = pad_sequences(X_sample, padding='post', maxlen=182)
y_sample = model.predict(X_sample).flatten().tolist()

print('Prediction: ',y_sample)

Prediction:  [0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.3802046775817871, 0.6197953820228577, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.36733871698379517, 0.6326612830162048, 0.3802046775817871, 0.6197953820228577, 0.36733871698379517, 0.6326612830162048, 0.3673386871814728, 0.6326612830162048, 0.3673386871814728, 0.6326612830162048]


In [51]:
ans=np.argmax(y_sample)
print(ans)

1
