In [1]:
import pandas as pd
import re

In [3]:
training1 = pd.read_csv("../input/funniness/Training_1.csv")
training2 = pd.read_csv("../input/funniness/Training_2.csv")
training = training1.append(training2)

In [4]:
testing = pd.read_csv("../input/funniness/Testing.csv")

In [5]:
training.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,100000,Donald Trump <skips/> G7 talks on climate cris...,misunderstands,22110,1.2
1,100001,Donald Trump <skips/> G7 talks on climate cris...,praises,33100,1.4
2,100002,The queen agrees to <suspend/> parliament,dismantle,31110,1.2
3,100003,<Apple/> reverses stance on iPhone repairs an...,Microsoft,20000,0.4
4,100004,Apple reverses stance on iPhone repairs and wi...,death,22211,1.6


In [6]:
training['num_start'] = training['original'].str.find("<")
training['num_end'] = training['original'].str.find(">")
training['headline'] = training.apply(lambda x: x['original'][:x['num_start']] + x['edit'] + x['original'][x['num_end'] + 1:], axis=1)
training = training.drop(columns=['num_start', 'num_end'])
training.head()

Unnamed: 0,id,original,edit,grades,meanGrade,headline
0,100000,Donald Trump <skips/> G7 talks on climate cris...,misunderstands,22110,1.2,Donald Trump misunderstands G7 talks on climat...
1,100001,Donald Trump <skips/> G7 talks on climate cris...,praises,33100,1.4,Donald Trump praises G7 talks on climate crisi...
2,100002,The queen agrees to <suspend/> parliament,dismantle,31110,1.2,The queen agrees to dismantle parliament
3,100003,<Apple/> reverses stance on iPhone repairs an...,Microsoft,20000,0.4,Microsoft reverses stance on iPhone repairs a...
4,100004,Apple reverses stance on iPhone repairs and wi...,death,22211,1.6,Apple reverses stance on iPhone repairs and wi...


In [7]:
training['size'] = training['headline'].str.len()
training['size'].describe()

count    17900.000000
mean        71.242682
std         18.070802
min         17.000000
25%         59.000000
50%         70.000000
75%         82.000000
max        151.000000
Name: size, dtype: float64

In [8]:
training['funny'] = [True if (grade >= 1.5) else False for grade in training['meanGrade']]
print(training[['meanGrade', 'funny']].head())

   meanGrade  funny
0        1.2  False
1        1.4  False
2        1.2  False
3        0.4  False
4        1.6   True


In [9]:
funny_label = training.funny.factorize()

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout,SpatialDropout1D, Embedding

In [11]:
headline = training.headline.values
tokenizer = Tokenizer(num_words=150)
tokenizer.fit_on_texts(headline)

vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(headline)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [14]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           531104    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 547,755
Trainable params: 547,755
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
train = model.fit(padded_sequence,funny_label[0],validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
testing['num_start'] = testing['original'].str.find("<")
testing['num_end'] = testing['original'].str.find(">")
testing['headline'] = testing.apply(lambda x: x['original'][:x['num_start']] + x['edit'] + x['original'][x['num_end'] + 1:], axis=1)
testing = testing.drop(columns=['num_start', 'num_end'])
testing.head()

Unnamed: 0,id,original,edit,grades,meanGrade,headline
0,36,The Latest : Election tally shows <Austria/> t...,Cars,31110,1.2,The Latest : Election tally shows Cars turning...
1,2157,House Intel <Republicans/> Have Cleared Trump ...,onions,11000,0.4,House Intel onions Have Cleared Trump . So Are...
2,9385,Christmas Is Canceled : Nazareth ’s Muslim <Ma...,grump,11111,1.0,Christmas Is Canceled : Nazareth ’s Muslim gru...
3,14434,White House says Trump 's legal <team/> suppor...,bozos,32210,1.6,White House says Trump 's legal bozos supports...
4,9462,Election <analysts/> move Ryan seat toward Dem...,movers,11000,0.4,Election movers move Ryan seat toward Dems aft...


In [19]:
testing['size'] = testing['headline'].str.len()
testing['size'].describe()

count    3024.000000
mean       70.207341
std        18.614008
min        20.000000
25%        57.000000
50%        68.000000
75%        81.000000
max       142.000000
Name: size, dtype: float64

In [20]:
testing['funny'] = [True if (grade >= 1.5) else False for grade in testing['meanGrade']]
print(testing[['meanGrade', 'funny']].head())

   meanGrade  funny
0        1.2  False
1        0.4  False
2        1.0  False
3        1.6   True
4        0.4  False


In [21]:
test_label = testing.funny.factorize()

In [22]:
headline = testing.headline.values
tokenizer = Tokenizer(num_words=150)
tokenizer.fit_on_texts(headline)

vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(headline)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [23]:
results = model.evaluate(padded_sequence, test_label[0], batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.49738845229148865, 0.8171296119689941]
