In [None]:


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import re
import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:
        print("\n".join(full_paths))
        

# Any results you write to the current directory are saved as output.
from sklearn.ensemble import  RandomForestClassifier as RFC
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score,accuracy_score ,roc_auc_score
from nltk.stem.snowball import SnowballStemmer
import xgboost as xgb
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR

In [None]:
# Load the tweets CSV from the Kaggle input directory into a DataFrame.
# Later cells read the "tweet" and "Sarcasm" columns from this frame.
df = pd.read_csv("/kaggle/input/sarcasm/tweets.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Discard rows with any missing value, then rows whose tweet is the empty string.
df = df.dropna()
df = df[df["tweet"] != ""]

In [None]:
# Ordered (pattern, replacement) rewrite rules applied by clean_text.
# Order matters: contractions are expanded before the bare apostrophe is
# removed, and punctuation is spaced out before the abbreviation rules that
# look for the spaced-out forms (" e g ", "e - mail", ...).
_CLEAN_TEXT_RULES = (
    # Replace every character outside the allowed set with a space.
    # NOTE: "+-=" inside the class is a character *range* ('+' .. '='), so
    # ':', ';' and '<' also survive this step; the ":" rule below relies on it.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Contractions.
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Separators that carry no meaning become spaces; the other symbols are
    # padded with spaces so they end up as stand-alone tokens.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Abbreviation rewrites.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The previous pattern r"\0s" was a NUL-character
    # escape and therefore could never match ordinary text.
    (r"0s\b", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace left behind by the rules above.
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:

    1. lower-case the text and split it on whitespace;
    2. drop NLTK English stop words and tokens shorter than three characters;
    3. apply the regex rewrite rules in ``_CLEAN_TEXT_RULES`` (character
       filtering, contraction expansion, punctuation spacing, abbreviations,
       whitespace collapsing);
    4. stem every remaining token with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces.
    """
    # The former `text.translate(string.punctuation)` call was removed on
    # purpose: str.translate treats a plain string as a table indexed by code
    # point, so it removed no punctuation at all and instead turned control
    # characters with code points 0-31 (tab, newline, ...) into punctuation
    # marks. Punctuation is handled by the regex rules below, and whitespace
    # control characters are now correctly treated as token separators.
    tokens = text.lower().split()

    # Stop-word and short-token filtering happens on the raw whitespace tokens,
    # i.e. before contractions are expanded.
    stops = set(stopwords.words("english"))
    text = " ".join(w for w in tokens if w not in stops and len(w) >= 3)

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Run every tweet through clean_text and store the cleaned text back in place.
df["tweet"] = df["tweet"].map(clean_text)

In [None]:
### Create sequence
# Size of the vocabulary handed to the Keras Tokenizer; the embedding matrix
# and the Embedding layer in later cells are sized with the same value.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
# Build the word index from the cleaned tweets.
tokenizer.fit_on_texts(df["tweet"])
# Convert each tweet into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(df["tweet"])
# Pad/truncate every sequence to 50 entries (must match input_length of the
# Embedding layer defined later).
data = pad_sequences(sequences, maxlen=50)

**Extract word embeddings from GloVe**

In [None]:
# Map each GloVe token to its 50-dimensional vector.
# Every line of the file is "<token> <v1> <v2> ... <v50>", separated by
# whitespace.
embeddings_index = dict()
# `with` guarantees the handle is closed even if parsing fails part-way, and
# the explicit encoding avoids depending on the platform default when reading
# the non-ASCII tokens in the GloVe vocabulary.
with open('/kaggle/input/glove/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

**Create a weight matrix**

In [None]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i; rows for words without a GloVe vector (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 50))
for word, index in tokenizer.word_index.items():
    # Stop at the first index that does not fit into the matrix.
    if index >= vocabulary_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:
import tensorflow as tf
from keras import backend as K
# Show the GPUs visible to the Keras TensorFlow backend as the cell output.
# NOTE(review): `_get_available_gpus` is a private (underscore-prefixed)
# backend helper; presumably it only exists in older Keras releases — confirm
# against the installed version.
K.tensorflow_backend._get_available_gpus()

In [None]:
import keras
# Create a TensorFlow 1.x style session from an explicit ConfigProto and
# register it as the session used by Keras.
# NOTE(review): device_count={'GPU': 1} presumably limits the session to one
# GPU device — confirm against the TF1 ConfigProto documentation.
config = tf.ConfigProto( device_count = {'GPU': 1 } ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [None]:
# Binary target column.
labels = df["Sarcasm"]

# Hold out the test rows BEFORE fitting the network. Previously the network
# was trained on all of `data` and then scored on `X_test`, which is drawn
# from the same rows, so its reported metrics were inflated by test-set
# leakage. The arguments (test_size=0.2, random_state=13) are identical to the
# split used for the classical models further down, so both calls produce the
# same partition and all models are compared on the same unseen rows.
X_train, X_test, y_train, y_test = tts(data, labels, test_size=0.2, random_state=13)

# Embedding (initialised from GloVe, fine-tuned) -> dropout -> 1-D convolution
# -> max pooling -> LSTM -> single sigmoid unit for binary classification.
model_glove = Sequential()
model_glove.add(Embedding(vocabulary_size, 50, input_length=50, weights=[embedding_matrix], trainable=True))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Fit train data
# validation_split carves the validation set out of the training rows only.
model_glove.fit(X_train, np.array(y_train), validation_split=0.2, epochs=10)

In [None]:
# Short alias for the trained Keras model, used in the model comparison below.
NN = model_glove

In [None]:
# Split the padded sequences and labels into train and test parts
# (test_size=0.2; random_state=13 fixes the shuffle so the split is repeatable).
X_train, X_test, y_train, y_test = tts(data, labels, test_size=0.2,  random_state=13)

In [None]:
# Gradient-boosted trees on the padded token-index sequences.
# The keyword was previously misspelled as `n_estimatore`, so the intended
# 200 boosting rounds were never applied to the model.
model = xgb.XGBClassifier(n_jobs=-1, n_estimators=200)
model.fit(X_train, y_train)

In [None]:
# Random forest on the padded token-index sequences, using all CPU cores.
# random_state is fixed (same seed as the train/test split) so the scores in
# the comparison table are reproducible across runs; without it every run
# grows a different forest.
rfc = RFC(n_jobs=-1, random_state=13)
rfc.fit(X_train, y_train)

In [None]:
# Logistic regression baseline with scikit-learn's default settings, fitted on
# the padded token-index sequences.
lr = LR()
lr.fit(X_train,y_train)

In [None]:
def _positive_class_scores(estimator, X):
    """Return a 1-D array with the predicted probability of class 1.

    Works for estimators exposing ``predict_proba`` (two-column output, the
    second column is taken) and for models whose probability output is a
    single column or flat vector (e.g. a Keras model with one sigmoid unit).
    """
    if hasattr(estimator, "predict_proba"):
        raw = estimator.predict_proba(X)
    else:
        raw = estimator.predict(X)
    raw = np.asarray(raw)
    if raw.ndim == 2 and raw.shape[1] == 2:
        return raw[:, 1]
    return raw.ravel()


# Order must match the model names used in the results table.
models = [NN, rfc, lr, model]
f1score = []
auc_score = []
accuracy = []
for estimator in models:
    scores = _positive_class_scores(estimator, X_test)
    # Hard 0/1 labels at the 0.5 threshold (assumes the labels are 0/1, as the
    # sigmoid/binary-crossentropy network already does).
    pred = (scores > 0.5).astype(int)
    # scikit-learn metrics take (y_true, y_pred); the arguments used to be
    # swapped, which gives a wrong ROC AUC. AUC is also computed from the
    # continuous scores rather than rounded labels, so it reflects ranking
    # quality instead of a single operating point.
    f1score.append(f1_score(y_test, pred))
    auc_score.append(roc_auc_score(y_test, scores))
    accuracy.append(accuracy_score(y_test, pred))
    
        

In [None]:
# Comparison table, one row per model, in the same order as the `models` list
# used for evaluation.
results = pd.DataFrame({
    "MODEL": ["Neaural Net with LSTM" ,"Random Forest Classifier" , "Logistic Regression" ,"XGBOOST Classifier"],
    "F1_Score": f1score,
    "Auc_Score": auc_score,
    # accuracy_score returns a fraction in [0, 1]; the column is labelled as a
    # percentage, so scale it accordingly.
    "Accuracy %": [100 * acc for acc in accuracy],
})

In [None]:
# Display the model comparison table as the cell output.
results