In [1]:
import numpy as np
import pandas as pd
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re as re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


import tensorflow as tf
from tensorflow import keras
from numpy import loadtxt
from tensorflow.keras.layers import Dense, Dropout, Flatten

import tensorflow.keras.backend as K

In [2]:
# Load the training and test sets from the working directory.

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [3]:
# Fill the missing values of the training set and drop the identifier column.
# Results are assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form is deprecated and does not
# modify the frame when pandas Copy-on-Write is active.

train_data['keyword'] = train_data['keyword'].fillna('no keyword')
# Replace the '%20' escape found in keywords with a plain space.
train_data['keyword'] = train_data['keyword'].str.replace('%20', ' ')
train_data['location'] = train_data['location'].fillna('no location')
# `axis` must be passed by keyword: the positional form `drop(labels, 1)`
# was removed in pandas 2.0.
train_data = train_data.drop(['id'], axis=1)

In [4]:
# Fill the missing values of the test set.
# Results are assigned back instead of using chained `fillna(..., inplace=True)`,
# which does not modify the frame when pandas Copy-on-Write is active.
test_data['keyword'] = test_data['keyword'].fillna('no keyword')
# Clean the test set's OWN keywords. The previous version read them from
# `train_data`, silently replacing every test keyword with a training keyword.
test_data['keyword'] = test_data['keyword'].str.replace('%20', ' ')
test_data['location'] = test_data['location'].fillna('no location')

In [5]:
#FUNCIONES UTILES

def only_letters(tweet):
    """Strip URLs and every character that is not a lowercase letter or whitespace.

    Any run starting with ``http`` up to the next whitespace is removed first;
    afterwards all characters outside ``a-z`` and whitespace are deleted
    (uppercase letters included, so callers lowercase the text beforehand).
    """
    without_urls = re.sub(r'http\S*', '', tweet)
    return re.sub(r'[^a-z\s]', '', without_urls)

# English stop words from the NLTK corpus, kept as a set for membership tests.
stop_words=set(stopwords.words("english"))

def filter_stopwords(tokenized_text):
    """Return the tokens of ``tokenized_text`` that are not in ``stop_words``.

    The input order is preserved and a new list is returned.
    """
    return [token for token in tokenized_text if token not in stop_words]

# Shared WordNet lemmatizer instance used by `lemmatize_tweet`.
lemmatizer = WordNetLemmatizer()

def lemmatize_tweet(tweet):
    """Lemmatize every word of ``tweet`` (an iterable of tokens).

    Returns a new list with ``lemmatizer.lemmatize`` applied to each word,
    in the original order.
    """
    return [lemmatizer.lemmatize(token) for token in tweet]

def transform_to_text(tweet_words):
    """Join the given words into one string separated by single spaces."""
    separator = " "
    return separator.join(tweet_words)

# Punkt sentence tokenizer, instantiated without any training text.
custom_sent_tokenizer = PunktSentenceTokenizer()

def process_content(sentence):
    """Part-of-speech tag a piece of text.

    The text is split into sentences with ``custom_sent_tokenizer``; each
    sentence is word-tokenized and tagged with ``nltk.pos_tag``. The tagged
    entries of all sentences are returned as one flat list, in reading order.
    """
    words_tagged = []
    for piece in custom_sent_tokenizer.tokenize(sentence):
        words_tagged.extend(nltk.pos_tag(nltk.word_tokenize(piece)))
    return words_tagged

# Text cleaning pipeline, applied identically to the train and test frames.
# Column assignments mutate each frame in place, so `train_data` and
# `test_data` receive the new columns.
for frame in (train_data, test_data):
    # Lowercase, then drop URLs and everything that is not a-z / whitespace
    cleaned = frame['text'].str.lower().apply(only_letters)

    # Tokenization
    tokens = cleaned.apply(word_tokenize)

    # Remove stopwords
    tokens = tokens.apply(filter_stopwords)

    # Lemmatization
    tokens = tokens.apply(lemmatize_tweet)

    # Transform back to text
    frame['clean_text'] = tokens.apply(transform_to_text)

    # Part of speech tagging, on both the raw and the cleaned text
    frame['tagged_text'] = frame['text'].apply(process_content)
    frame['tagged_clean_text'] = frame['clean_text'].apply(process_content)

In [6]:
def count_Nouns(list):
    """Count the ``(word, tag)`` pairs whose tag starts with ``'N'``."""
    return sum(1 for _word, pos in list if pos[0] == 'N')

def count_Adjetives(list):
    """Count the ``(word, tag)`` pairs whose tag starts with ``'J'``."""
    return sum(1 for _word, pos in list if pos[0] == 'J')

def count_Verbs(list):
    """Count the ``(word, tag)`` pairs whose tag starts with ``'V'``."""
    return sum(1 for _word, pos in list if pos[0] == 'V')


# Meta features computed from the raw tweet text, as (column name, extractor)
# pairs. The list order is the order in which the columns are created.
text_features = [
    # word_count
    ('word_count', lambda x: len(str(x).split())),
    # unique_word_count
    ('unique_word_count', lambda x: len(set(str(x).split()))),
    # stop_word_count
    ('stop_word_count', lambda x: sum(1 for w in str(x).lower().split() if w in stop_words)),
    # url_count ('https' always contains 'http', so a single check is enough)
    ('url_count', lambda x: sum(1 for w in str(x).lower().split() if 'http' in w)),
    # mean_word_length
    ('mean_word_length', lambda x: np.mean([len(w) for w in str(x).split()])),
    # length
    ('tweet_length', lambda x: len(str(x))),
    # punctuation_count
    ('punctuation_count', lambda x: sum(1 for c in str(x) if c in string.punctuation)),
    # hashtag_count
    ('hashtag_count', lambda x: str(x).count('#')),
    # mention_count
    ('mention_count', lambda x: str(x).count('@')),
]

# Part-of-speech counts computed from the tagged raw text.
tag_features = [
    ('noun_count', count_Nouns),
    ('verb_count', count_Verbs),
    ('adjetives_count', count_Adjetives),
]

# Apply every extractor to both frames; column assignment mutates them in place.
for frame in (train_data, test_data):
    for column_name, extractor in text_features:
        frame[column_name] = frame['text'].apply(extractor)
    for column_name, counter in tag_features:
        frame[column_name] = frame['tagged_text'].apply(counter)

In [7]:
# Preview the first rows of the training frame, including the engineered columns.
train_data.head()

Unnamed: 0,keyword,location,text,target,clean_text,tagged_text,tagged_clean_text,word_count,unique_word_count,stop_word_count,url_count,mean_word_length,tweet_length,punctuation_count,hashtag_count,mention_count,noun_count,verb_count,adjetives_count
0,no keyword,no location,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u,"[(Our, PRP$), (Deeds, NNS), (are, VBP), (the, ...","[(deed, NN), (reason, NN), (earthquake, NN), (...",13,13,6,0,4.384615,69,1,1,0,6,1,0
1,no keyword,no location,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[(Forest, NNP), (fire, NN), (near, IN), (La, N...","[(forest, JJS), (fire, NN), (near, IN), (la, J...",7,7,0,0,4.571429,38,1,0,0,6,0,0
2,no keyword,no location,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,"[(All, DT), (residents, NNS), (asked, VBD), (t...","[(resident, NN), (asked, VBD), (shelter, JJ), ...",22,20,11,0,5.090909,133,3,0,0,7,7,1
3,no keyword,no location,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,"[(13,000, CD), (people, NNS), (receive, JJ), (...","[(people, NNS), (receive, VBP), (wildfire, NN)...",8,8,1,0,7.125,65,2,1,0,4,1,1
4,no keyword,no location,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...,"[(Just, RB), (got, VBN), (sent, VBD), (this, D...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...",16,15,7,0,4.5,88,2,2,0,6,3,0


In [8]:
# Preview the first rows of the test frame, including the engineered columns.
test_data.head()

Unnamed: 0,id,keyword,location,text,clean_text,tagged_text,tagged_clean_text,word_count,unique_word_count,stop_word_count,url_count,mean_word_length,tweet_length,punctuation_count,hashtag_count,mention_count,noun_count,verb_count,adjetives_count
0,0,no keyword,no location,Just happened a terrible car crash,happened terrible car crash,"[(Just, RB), (happened, VBD), (a, DT), (terrib...","[(happened, VBN), (terrible, JJ), (car, NN), (...",6,6,2,0,4.833333,34,0,0,0,2,1,1
1,2,no keyword,no location,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...,"[(Heard, NNP), (about, IN), (#, #), (earthquak...","[(heard, RB), (earthquake, NN), (different, JJ...",9,9,2,0,6.222222,64,3,1,0,4,2,2
2,3,no keyword,no location,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...,"[(there, EX), (is, VBZ), (a, DT), (forest, JJ)...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...",19,19,9,0,4.105263,96,2,0,0,4,4,2
3,9,no keyword,no location,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,"[(Apocalypse, NNP), (lighting, NN), (., .), (#...","[(apocalypse, NN), (lighting, VBG), (spokane, ...",4,4,0,0,9.25,40,3,2,0,4,0,0
4,11,no keyword,no location,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,"[(Typhoon, NNP), (Soudelor, NNP), (kills, VBZ)...","[(typhoon, NN), (soudelor, NN), (kill, VB), (c...",8,8,2,0,4.75,45,0,0,0,4,1,0


## TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF. Terms must occur in at least 2 documents and in at
# most half of them.
tf_idf = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range= (1,2))

# Fit the vectorizer on the TRAIN set and build its document-term matrix.
texts = train_data['clean_text']
features = tf_idf.fit_transform(texts)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older versions.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_words = tf_idf.get_feature_names_out()
else:
    feature_words = tf_idf.get_feature_names()

df_tf_idf = pd.DataFrame(data = features.toarray(), columns = feature_words)

# Attach the label and expose the row position as an `index` column, which is
# the key used to merge these features back into `train_data`.
df_tf_idf["_target_"] = train_data.target
df_tf_idf = df_tf_idf.reset_index()
df_tf_idf.head()

Unnamed: 0,index,aa,ab,aba,aba woman,abandon,abandoned,abandoned aircraft,abbott,abbswinston,...,zipper,zipper bag,zombie,zombie apocalypse,zone,zone coming,zone dont,zouma,zouma flattened,_target_
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
# Transform the TEST set with the vectorizer already fitted on the training
# texts (transform only, so the vocabulary and IDF weights stay the same).
texts_test = test_data['clean_text']

features_test = tf_idf.transform(texts_test)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older versions.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_words_test = tf_idf.get_feature_names_out()
else:
    feature_words_test = tf_idf.get_feature_names()

df_tf_idf_test = pd.DataFrame(data = features_test.toarray(), columns = feature_words_test)

# Expose the row position as an `index` column, which is the key used to merge
# these features back into `test_data`.
df_tf_idf_test = df_tf_idf_test.reset_index()
df_tf_idf_test.head()

Unnamed: 0,index,aa,ab,aba,aba woman,abandon,abandoned,abandoned aircraft,abbott,abbswinston,...,zippednews,zipper,zipper bag,zombie,zombie apocalypse,zone,zone coming,zone dont,zouma,zouma flattened
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#Combinamos TFIDF con los features obtenidos (TRAIN SET)
train_data = train_data.reset_index()
train_data = train_data.merge(df_tf_idf, how='inner',on='index')
train_data.head()


Unnamed: 0,index,keyword,location_x,text_x,target_x,clean_text,tagged_text,tagged_clean_text,word_count,unique_word_count,...,zipper,zipper bag,zombie,zombie apocalypse,zone,zone coming,zone dont,zouma,zouma flattened,_target_
0,0,no keyword,no location,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u,"[(Our, PRP$), (Deeds, NNS), (are, VBP), (the, ...","[(deed, NN), (reason, NN), (earthquake, NN), (...",13,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,no keyword,no location,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[(Forest, NNP), (fire, NN), (near, IN), (La, N...","[(forest, JJS), (fire, NN), (near, IN), (la, J...",7,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,no keyword,no location,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,"[(All, DT), (residents, NNS), (asked, VBD), (t...","[(resident, NN), (asked, VBD), (shelter, JJ), ...",22,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,3,no keyword,no location,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,"[(13,000, CD), (people, NNS), (receive, JJ), (...","[(people, NNS), (receive, VBP), (wildfire, NN)...",8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,4,no keyword,no location,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...,"[(Just, RB), (got, VBN), (sent, VBD), (this, D...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N...",16,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [12]:
#Combinamos TFIDF con los features obtenidos (TEST SET)
test_data = test_data.reset_index()  
test_data = test_data.merge(df_tf_idf_test, how='inner',on='index')
test_data.head()

Unnamed: 0,index,id_x,keyword,location_x,text_x,clean_text,tagged_text,tagged_clean_text,word_count,unique_word_count,...,zippednews,zipper,zipper bag,zombie,zombie apocalypse,zone,zone coming,zone dont,zouma,zouma flattened
0,0,0,no keyword,no location,Just happened a terrible car crash,happened terrible car crash,"[(Just, RB), (happened, VBD), (a, DT), (terrib...","[(happened, VBN), (terrible, JJ), (car, NN), (...",6,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,no keyword,no location,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...,"[(Heard, NNP), (about, IN), (#, #), (earthquak...","[(heard, RB), (earthquake, NN), (different, JJ...",9,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,no keyword,no location,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...,"[(there, EX), (is, VBZ), (a, DT), (forest, JJ)...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...",19,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,9,no keyword,no location,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,"[(Apocalypse, NNP), (lighting, NN), (., .), (#...","[(apocalypse, NN), (lighting, VBG), (spokane, ...",4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,11,no keyword,no location,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,"[(Typhoon, NNP), (Soudelor, NNP), (kills, VBZ)...","[(typhoon, NN), (soudelor, NN), (kill, VB), (c...",8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Build the training / validation matrices.
scaler = StandardScaler()

# Drop the label and every non-numeric column; what remains are the engineered
# counts plus the TF-IDF columns.
X = train_data.drop(["_target_",'tagged_text','tagged_clean_text','index','clean_text','text_x','target_x','keyword','location_x'], axis=1)
X = X.values
y = train_data["_target_"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# Fit the scaler on the training split only, then reuse it everywhere else.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare the competition TEST data.
X_TEST = test_data.drop(['id_x','tagged_text','tagged_clean_text','index','clean_text','text_x','keyword','location_x'], axis=1)
# Use `transform`, not `fit_transform`: re-fitting on the test set would scale
# it with different statistics than the ones the model was trained with.
# `.values` matches the ndarray input the scaler was fitted on.
X_TEST = scaler.transform(X_TEST.values)

## Modelo

In [None]:
import xgboost as xgb

# Hyperparameters agreed with the team after the random searches that were run.
clf = xgb.XGBClassifier(max_depth=200, n_estimators=400, subsample=1, learning_rate=0.07, reg_lambda=0.1, reg_alpha=0.1,
                       gamma=1)

clf.fit(X_train, y_train)

# Predict on the held-out validation split. The previous `X_Test` was an
# undefined name (NameError); the split produced by `train_test_split` is
# `X_test`, and the next cell scores these predictions against `y_test`.
predictions = clf.predict(X_test)

In [None]:
# F1 score of the predictions against the validation labels.
# `f1_score` is already imported in the notebook's first cell.
f1score = f1_score(y_true=y_test, y_pred=predictions)
print(f'Counts model score: {f1score*100}%')

In [None]:
# Build the submission file.

predictions = clf.predict(X_TEST)

# The identifier column is read from `id_x` and stored next to the predicted label.
test_data['id'] = test_data['id_x']
test_data['target'] = predictions.astype(int)

submission = test_data[['id', 'target']]
submission.to_csv('submissionTFIDF-xgb.csv', header=True, index=False)
len(submission)


In [None]:
# Inspect how many submitted rows were assigned to each predicted class.
submission['target'].value_counts()