In [147]:
from scipy.stats import uniform, randint
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

from lightgbm import LGBMClassifier

In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict

In [149]:
# Load the labelled training tweets and the unlabelled test tweets.
tweets = pd.read_csv('train.csv')
tweets_test = pd.read_csv('test.csv')

# Apply the same basic preprocessing to both frames.
for frame in (tweets, tweets_test):
    # Character length of the raw tweet text.
    frame['len_text'] = frame['text'].str.len()
    # Replace the URL-encoded space ('%20') in keywords and give missing
    # keywords an explicit placeholder value.
    # The result is assigned back instead of calling
    # `frame['keyword'].fillna(..., inplace=True)`: an in-place call on a
    # column selection is chained assignment, which pandas deprecates and
    # which leaves the frame unchanged under Copy-on-Write.
    frame['keyword'] = (
        frame['keyword']
        .str.replace('%20', ' ', regex=False)
        .fillna('no keyword')
    )

# Shuffle the training rows with a fixed seed so the order is reproducible.
tweets = tweets.sample(frac=1, random_state=1)

In [150]:
# Start the cleaned-text column from a lower-cased copy of the raw text.
for frame in (tweets, tweets_test):
    frame['clean_text'] = frame['text'].str.lower()

In [151]:
def only_letters(tweet):
    """Remove URLs and every character that is not a-z or whitespace.

    Anything starting with 'http' up to the next whitespace is dropped first.
    Removed characters are deleted, not replaced by a space, and upper-case
    letters are removed as well, so the input is expected to be lower-cased.
    """
    without_urls = re.sub(r'http\S*', '', tweet)
    return re.sub(r'[^a-z\s]', '', without_urls)

In [152]:
# Strip URLs and non-letter characters from the lower-cased text.
for frame in (tweets, tweets_test):
    frame['clean_text'] = frame['clean_text'].apply(only_letters)

In [153]:
#Tokenización
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Split every cleaned text into a list of word tokens.
for frame in (tweets, tweets_test):
    frame['clean_text'] = frame['clean_text'].apply(word_tokenize)

# Set of English stop words, used later for filtering.
stop_words = set(stopwords.words("english"))

In [154]:
def filter_stopwords(tokenized_text):
    """Return the tokens of `tokenized_text` that are not in `stop_words`.

    Token order is preserved; `stop_words` is the module-level set.
    """
    return [token for token in tokenized_text if token not in stop_words]

In [155]:
# Drop stop words from every token list.
for frame in (tweets, tweets_test):
    frame['clean_text'] = frame['clean_text'].apply(filter_stopwords)

In [156]:
from nltk.stem.wordnet import WordNetLemmatizer
# Single shared lemmatizer instance, used by lemmatize_tweet.
lemmatizer = WordNetLemmatizer()

In [157]:
def lemmatize_tweet(tweet):
    """Lemmatize every token of `tweet` (an iterable of words).

    Returns a new list, in the same order, of `lemmatizer.lemmatize(word)`
    results using the module-level `lemmatizer`.
    """
    return [lemmatizer.lemmatize(token) for token in tweet]

In [158]:
# Leftover tokens to discard. Presumably 'amp' comes from the HTML entity
# '&amp;' and 'im' from "I'm" after punctuation stripping — TODO confirm.
_NOISE_TOKENS = {'amp', 'im'}


def _lemmatize_and_join(tokens):
    """Lemmatize `tokens`, drop noise tokens and join with single spaces.

    Noise is removed per whole token. The earlier regex `r'amp | im'` matched
    substrings of the joined text and corrupted ordinary words (for example
    "camp fire" became "cfire" and " important" became "portant").
    """
    return ' '.join(
        word for word in lemmatize_tweet(tokens) if word not in _NOISE_TOKENS
    )


tweets['clean_text'] = tweets['clean_text'].apply(_lemmatize_and_join)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(_lemmatize_and_join)

In [159]:
# New features

In [160]:
# Hashtags in the tweet

In [161]:
def get_hashtags(s):
    """Return the hashtags in `s`, without the leading '#'.

    A hashtag is any whitespace-separated token that starts with '#';
    everything after the '#' is kept as-is (a lone '#' yields '').
    """
    found = []
    for token in s.split():
        if token.startswith('#'):
            found.append(token[1:])
    return found

In [162]:
# Extract the hashtag list of every raw tweet.
for frame in (tweets, tweets_test):
    frame['hashtags'] = frame['text'].apply(get_hashtags)

In [163]:
# Ratio of the total hashtag length to the text length

In [164]:
def hashtag_length_proportion(hashtags,length):
    """Return the share of `length` taken up by the characters of `hashtags`.

    Parameters:
        hashtags: iterable of hashtag strings (without the '#').
        length: total text length used as the denominator.

    Returns:
        Sum of the hashtag lengths divided by `length`, or 0.0 when `length`
        is zero (an empty text has no hashtags and would otherwise raise
        ZeroDivisionError).
    """
    if not length:
        return 0.0
    return sum(len(tag) for tag in hashtags) / length

In [165]:
def _hashtag_share_of_row(row):
    """Row-wise adapter: hashtag length proportion for one DataFrame row."""
    return hashtag_length_proportion(row['hashtags'], row['len_text'])


for frame in (tweets, tweets_test):
    frame['len_hashtag_over_text'] = frame.apply(_hashtag_share_of_row, axis=1)

In [166]:
# Number of hashtags in the tweet

In [167]:
def hashtags_count(l):
    """Return the number of hashtags in the list `l`."""
    total = len(l)
    return total

In [168]:
# Count the hashtags found in each tweet.
for frame in (tweets, tweets_test):
    frame['hashtags_count'] = frame['hashtags'].apply(hashtags_count)

In [169]:
# Users mentioned in the tweet

In [170]:
def get_mentioned_users(s):
    """Return the user names mentioned in `s`, without the leading '@'.

    A mention is any whitespace-separated token that starts with '@';
    the remainder of the token is kept unchanged.
    """
    mentions = []
    for token in s.split():
        if token.startswith('@'):
            mentions.append(token[1:])
    return mentions

In [171]:
# Extract the list of mentioned users from every raw tweet.
for frame in (tweets, tweets_test):
    frame['users'] = frame['text'].apply(get_mentioned_users)

In [172]:
def users_count(l):
    """Return the number of mentioned users in the list `l`."""
    total = len(l)
    return total

In [173]:
# Count the mentioned users in each tweet.
for frame in (tweets, tweets_test):
    frame['users_count'] = frame['users'].apply(users_count)

In [174]:
# URLs in the text

In [175]:
def find_url(text):
    """Return every http:// or https:// URL in `text`, in order of appearance.

    A URL runs from the scheme up to the next whitespace character.
    """
    return re.findall(r'(https?://\S+)', text)

In [176]:
# Collect the list of URLs found in every raw tweet.
for frame in (tweets, tweets_test):
    frame['urls'] = frame['text'].apply(find_url)

In [177]:
def urls_count(l):
    """Return the number of URLs in the list `l`."""
    total = len(l)
    return total

In [178]:
# Count the URLs found in each tweet.
for frame in (tweets, tweets_test):
    frame['urls_count'] = frame['urls'].apply(urls_count)

In [179]:
def has_url(text):
    """Return 1 if the substring 'http' occurs in `text`, otherwise 0.

    The check is case-sensitive and matches 'http' anywhere, not only at the
    start of a URL.
    """
    return 1 if 'http' in text else 0

In [180]:
# Binary flag: does the raw tweet contain 'http'?
for frame in (tweets, tweets_test):
    frame['has_url'] = frame['text'].apply(has_url)

In [181]:
# Mean (target) encoding of the keyword: each training row receives the
# average `target` of all training rows that share its keyword.
# NOTE(review): the average includes the row's own label and is computed on
# the whole training set before any cross-validation split, so scores
# measured on this data are optimistic.
keyword_target_mean = tweets.groupby('keyword')['target'].mean()
tweets['keyword_encoded'] = tweets['keyword'].map(keyword_target_mean)

In [182]:
# Lookup table keyword -> encoded value, built from the training rows
# (rows sharing a keyword carry the same encoded value).
keywords_dict = tweets.set_index('keyword')['keyword_encoded'].to_dict()

In [183]:
# Encode test keywords with the values learned on the training set.
# A keyword that never occurs in training has no entry in `keywords_dict`
# and would map to NaN; fall back to the overall training target mean.
tweets_test['keyword_encoded'] = (
    tweets_test['keyword']
    .map(keywords_dict)
    .fillna(tweets['target'].mean())
)

In [184]:
for frame in (tweets, tweets_test):
    # Character length of the cleaned text.
    frame['len_clean_text'] = frame['clean_text'].str.len()
    # Ratio of the cleaned-text length to the original text length.
    frame['len_clean_text_over_text'] = frame['len_clean_text'] / frame['len_text']

In [185]:
# Preview the training frame with all engineered columns.
tweets.head()

Unnamed: 0,id,keyword,location,text,target,len_text,clean_text,hashtags,len_hashtag_over_text,hashtags_count,users,users_count,urls,urls_count,has_url,keyword_encoded,len_clean_text,len_clean_text_over_text
3228,4632,emergency services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1,141,goulburn man henry van bilsen missing emergenc...,[],0.0,0,[],0,[http://t.co/z99pKJzTRp],1,1,0.333333,90,0.638298
3706,5271,fear,,The things we fear most in organizations--fluc...,0,138,thing fear organizationsfluctuations disturban...,[],0.0,0,[],0,[],0,0,0.125,103,0.746377
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0,23,tsunamiesh hey esh,[],0.0,0,[tsunami_esh],1,[],0,0,0.323529,18,0.782609
2887,4149,drown,,@POTUS you until you drown by water entering t...,0,140,potus drown water entering lung alive caused g...,[],0.0,0,[POTUS],1,[],0,0,0.09375,80,0.571429
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\r\nThese wounds they will ...,1,51,crawling skin wound hea,[],0.0,0,[],0,[],0,0,0.30303,23,0.45098


In [186]:
# Keep only the numeric feature columns for the model.
numeric_features = [
    'len_text',
    'len_hashtag_over_text',
    'hashtags_count',
    'has_url',
    'urls_count',
    'users_count',
    'keyword_encoded',
    'len_clean_text',
    'len_clean_text_over_text',
]
df = tweets.loc[:, numeric_features]
df.head()

Unnamed: 0,len_text,len_hashtag_over_text,hashtags_count,has_url,urls_count,users_count,keyword_encoded,len_clean_text,len_clean_text_over_text
3228,141,0.0,0,1,1,0,0.333333,90,0.638298
3706,138,0.0,0,0,0,0,0.125,103,0.746377
6957,23,0.0,0,0,0,1,0.323529,18,0.782609
2887,140,0.0,0,0,0,1,0.09375,80,0.571429
7464,51,0.0,0,0,0,0,0.30303,23,0.45098


In [187]:
# Same numeric feature columns, in the same order, for the test set.
test_numeric_features = [
    'len_text',
    'len_hashtag_over_text',
    'hashtags_count',
    'has_url',
    'urls_count',
    'users_count',
    'keyword_encoded',
    'len_clean_text',
    'len_clean_text_over_text',
]
df_test = tweets_test.loc[:, test_numeric_features]
df_test.head()

Unnamed: 0,len_text,len_hashtag_over_text,hashtags_count,has_url,urls_count,users_count,keyword_encoded,len_clean_text,len_clean_text_over_text
0,34,0.0,0,0,0,0,0.688525,27,0.794118
1,64,0.15625,1,0,0,0,0.688525,50,0.78125
2,96,0.0,0,0,0,0,0.688525,54,0.5625
3,40,0.4,2,0,0,0,0.688525,36,0.9
4,45,0.0,0,0,0,0,0.688525,34,0.755556


In [188]:
# Training labels, row-aligned with the training feature frame `df`.
y = tweets['target']

In [189]:
# Candidate hyper-parameter values for the LightGBM search.
# Every entry is an explicit list, so the space is a finite grid of
# 3 * 4 * 3 * 3 * 4 * 3 = 1296 combinations.
# NOTE(review): a tree of depth <= 5 has at most 2**5 = 32 leaves, so every
# num_leaves candidate (>= 40) is non-binding given these max_depth values —
# confirm whether that is intended.
params={
 "num_leaves"    : [40, 50, 60] ,
 "min_data_in_leaf" : [ 3, 4, 5, 6],
 "max_depth" : [ 3,4,5],
 "learning_rate": [ 0.1, 0.15, 0.2],
 "num_iterations" : [ 90, 100, 110, 120],
 "feature_fraction" : [0.4, 0.5, 0.6]
}

In [190]:
# Base estimator with default settings; the search overrides the tuned parameters.
model = LGBMClassifier()

In [191]:
# Every entry of `params` is a list, so the search space is a finite grid.
# Asking for more iterations than there are grid points only makes
# scikit-learn warn and truncate, so cap n_iter at the grid size.
n_candidates = int(np.prod([len(values) for values in params.values()]))

# 5-fold cross-validated search optimising F1. random_state fixes which
# candidates are drawn, and in which order, so reruns are reproducible.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    scoring='f1',
    cv=5,
    verbose=1,
    n_iter=min(1400, n_candidates),
    random_state=1,
)

In [None]:
# Run the cross-validated hyper-parameter search on the training features.
random_search.fit(df,y)

In [None]:
# Display the estimator selected by the search.
random_search.best_estimator_

In [None]:
# Display the winning hyper-parameter combination.
random_search.best_params_

In [None]:
# From here on `model` refers to the best estimator found by the search.
model = random_search.best_estimator_

In [None]:
# Fit the selected model on the full training set.
# NOTE(review): the search was created without `refit=...`; with
# scikit-learn's default (refit=True) best_estimator_ is already fitted on
# all of (df, y), so this call looks redundant — confirm.
model.fit(df, y)

In [None]:
# Predictions on the training set (the same rows the model was fitted on).
preds = model.predict(df)

In [None]:
# F1 of the training-set predictions against the training labels.
# This is measured on the data the model was fitted on, not on held-out data.
f1score = f1_score(tweets['target'], preds)
print(f'Counts model score: {f1score}')

In [None]:
# Root mean squared error between the training labels and the training-set
# predictions.
rmse = np.sqrt(mean_squared_error(tweets['target'], preds))
# A single f-string format spec replaces the placeholder-less f-string mixed
# with %-formatting; ':f' prints the same six decimals as '%f'.
print(f'RMSE: {rmse:f}')

In [None]:
# confusion_matrix is also imported in the first cell of the notebook.
from sklearn.metrics import confusion_matrix
# Confusion matrix of the training-set predictions against the training labels.
print(confusion_matrix(tweets['target'], preds))

In [None]:
# Predictions on the test set.
preds_test = model.predict(df_test)
preds_test

In [None]:
# Attach the integer predictions to the test frame, then keep only the two
# columns written to the submission file.
tweets_test['target'] = preds_test.astype(int)
submission = tweets_test.loc[:,['id','target']]

In [None]:
# Write the submission as CSV with a header row and without the index column.
submission.to_csv(path_or_buf='submissionlgbm.csv',header=True,index=False)

Feature importance

In [None]:
from lightgbm import plot_importance
# Plot the model's feature importances (at most 10 features) on a 10x10 inch figure.
fig, ax = plt.subplots(1,1,figsize=(10,10))
plot_importance(model, max_num_features=10, ax=ax)

In [None]:
#import pickle
#pickle.dump(model, open("modelolgbm", "wb"))