In [573]:
import pandas as pd
import datetime as dt
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier, BaggingRegressor, AdaBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from time import time
from nltk.corpus import stopwords

In [574]:
import re
import string
import nltk
from nltk.corpus import stopwords

In [575]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,GRU
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

In [576]:
from nltk.stem.snowball import SnowballStemmer

In [577]:
stemmer=SnowballStemmer(language='english')

In [578]:
def stemmizar_texto(texto):
    texto=' '.join([stemmer.stem(palabra) for palabra in texto.split() ])
    return texto

In [579]:
nltk.download('stopwords')
english_stopwords=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Santi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [580]:
train_set=pd.read_csv("train.csv",encoding='utf-8')
test_set=pd.read_csv("test.csv",encoding='utf-8')

In [581]:
def quitar_stopwords(texto):
    texto = texto.lower()
    texto=' '.join([word for word in texto.split() if word not in english_stopwords])
    return texto

def quitar_menciones(texto):
    texto=' '.join([palabra for palabra in texto.split() if '@' not in palabra])
    return texto

def quitar_links(texto):
    texto=' '.join([palabra for palabra in texto.split() if 'http:/' not in palabra])
    return texto    

In [582]:
Diccionario_de_lenguaje_de_internet={
"2moro": "Tomorrow",
"2nite": "Tonight",
"4EAE": "For Ever And Ever",
"ABT":"About",
"ADN": "Any Day Now",
"AFAIC": "As Far As I’m Concerned",
"AFAICT": "As Far As I Can Tell",
"AFAIK": "As Far As I Know",
"AFAIR": "As Far As I Remember",
"AKA": "Also Known As",
"AMA": "Ask Me Anything",
"ASAIC": "As Soon As I Can",
"ASAP": "As Soon As Possible",
"ATM": "At The Moment",
"B4": "Before",
"B4N": "Bye For Now",
"Bae": "Babe/Before Anyone Else",
"BBL": "Be Back Later",
"BBT": "Be Back Tomorrow",
"BCNU": "Be Seeing You",
"BD": "Big Deal",
"BF": "Boyfriend",
"BFF": "Best Friends Forever",
"BMT": "Before My Time",
"BOL": "Be On Later",
"BOT": "Back On Topic",
"BRB": "Be Right Back",
"BRO": "Brother",
"BT": "But",
"BTW": "By The Way",
"CFY": "Calling For You",
"CU": "See You",
"CUL": "See You Later",
"Cuz": "Because",
"CYA": "Cover Your Ass",
"DAE": "Does Anyone Else",
"DBA": "Doing Business As",
"DFTBA": "Don’t Forget To Be Awesome",
"DIKU": "Do I Know You",
"DM": "Direct Message",
"DND": "Do Not Disturb",
"DR": "Double Rainbow",
"DWBH": "Don’t Worry, Be Happy",
"ELI5": "Explain Like I’m 5",
"EOM": "End Of Message",
"EOS": "End Of Story",
"F2F": "Face To Face",
"FAQ": "Frequently Asked Question",
"FB": "Facebook",
"FBF": "Flash Back Friday",
"FF": "Follow Friday",
"FIFY": "Fixed It For You",
"FITB": "Fill In The Blank",
"FML": "Fuck My Life",
"FOMO": "Fear Of Missing Out",
"FTFY": "Fixed That For You",
"FTL": "For The Loss",
"FTW": "For The Win",
"FWB": "Friends With Benefits",
"FWIW": "For What It’s Worth",
"FYE": "For Your Entertainment",
"FYEO": "For Your Eyes Only",
"FYI": "For Your Information",
"GA": "Go Ahead",
"GAL": "Get A Life",
"GF": "Girlfriend",
"GM": "Good Morning",
"GN": "Good Night",
"Gr8": "Great",
"GTR": "Getting Ready",
"HB": "Hurry Back",
"HBD": "Happy Birthday",
"HBU": "How About You",
"HMB": "Hit Be Back",
"HMU": "Hit Me Up",
"HRU": "How Are You",
"HTH": "Hope This Helps",
"IAC": "In Any Case",
"IC": "I See",
"ICYMI": "In Case You Missed It",
"IDC": "I Don’t Care",
"IDK": "I Don’t Know",
"IG": "Instagram",
"IIRC": "If I Remember Correctly",
"IKR": "I Know Right",
"ILY": "I Love You",
"IMHO": "In My Humble Opinion",
"IMMD": "It Made My Day",
"IMY": "I Miss You",
"IRL": "In Real Life",
"IS": "I’m Sorry",
"ISO": "In Search Of",
"IU2U": "It’s Up To You",
"J4F": "Just For Fun",
"JAM": "Just A Minute",
"JFY": "Just For You",
"JIC": "Just In Case",
"JK": "Just Kidding",
"JSYK": "Just So You Know",
"KK": "Okay",
"L8": "Late",
"L8R": "Later",
"LMA": "Leave Me Alone",
"LMAO": "Laughing My Ass Off",
"LMBO": "Laughing My Butt Off",
"LMK": "Let Me Know",
"LOL": "Laugh Out Loud",
"LTNS": "Long Time No See",
"LYLAS": "Love You Like A Sister",
"M/F": "Male or Female",
"M8": "Mate",
"MP": "My pleasure",
"MSM": "Mainstream Media",
"MU": "Miss You",
"MYOB": "Mind Your Own Business",
"NAGI": "Not A Good Idea",
"NBD": "No Big Deal",
"NE1": "Anyone",
"NM": "Not Much",
"NP": "No Problem",
"NSFL": "Not Safe For Life",
"NSFW": "Not Safe For Work",
"NTS": "Note To Self",
"NVM": "Never Mind",
"OC": "Original Content",
"OIC": "Oh ! I See",
"OMD": "Oh My Damn",
"OMG": "Oh My Goodness",
"OMW": "On My Way",
"OT": "Off Topic",
"OFC": "Of course",
"PAW": "Parents Are Watching",
"Pls": "Please",
"POTD": "Photo Of The Day",
"POV": "Point Of View",
"PPL": "People",
"PTB": "Please Text Back",
"Q4U": "Question For You",
"QQ": "Crying",
"RBTL": "Read Between The Lines",
"RIP": "Rest In Peace",
"RL": "Real Life",
"ROFL": "Rolling On the Floor Laughing",
"RT": "Retweet",
"RTM": "Read The Manual",
"SIS": "Sister",
"SITD": "Still In The Dark",
"SM": "Social Media",
"SMH": "Shaking My Head",
"SMY": "Somebody",
"SNH": "Sarcasm Noted Here",
"SOL": "Sooner Or Later",
"Some1": "Someone",
"SRSLY": "Seriously",
"STBY": "Sucks To Be You",
"Str8": "Straight",
"SYS": "See You Soon",
"TBA": "To Be Announced",
"TBH": "To Be Honest",
"TBT": "Throwback Thursday",
"TBT": "Truth Be Told",
"TFH": "Thread From Hell",
"TFTI": "Thanks For The Invite",
"TGIF": "Thank God It’s Friday",
"THX": "Thanks",
"TIA": "Thanks in Advance",
"TIL": "Today I Learned",
"TL;DR": "Too Long; Didn’t Read",
"TLDR":"Too long didn’t read",
"TL DR":"Too long didn’t read",
"TLC": "Tender Loving Care",
"TMI": "Too Much Information",
"TTYL": "Talk To You Later",
"TTYS": "Talk To You Soon",
"Txt": "Text",
"TYVM": "Thank You Very Much",
"U": "You",
"U4F": "You Forever",
"UR": "Your",
"VBG": "Very Big Grin",
"VSF": "Very Sad Face",
"WB": "Welcome Back",
"WBU": "What About You?",
"WEG": "Wicked Evil Grin",
"WKND": "Weekend",
"WOM": "Word of Mouth",
"WOTD": "Word Of The Day",
"Wru": "Who Are You",
"WTH": "What The Heck?",
"WTPA": "Where The Party At?",
"WU?": "What's Up",
"WU":"What's Up",
"WYCM": "Will You Call Me?",
"WYWH": "Wish You Were Here",
"XOXO": "Hugs and Kisses",
"YGM": "You’ve Got Mail",
"YNK": "You Never Know",
"YOLO": "You Only Live Once",
"YT": "YouTube",
"YW": "You’re Welcome",
}

In [583]:
def reemplazar_lenguaje_internet(texto):
    texto=texto.upper()
    palabras=texto.split()
    palabras_procesadas=[]
    for palabra in palabras:
        traduccion=Diccionario_de_lenguaje_de_internet.get(palabra,'not internet slang')
        if(traduccion!='not internet slang'):
            lista_aux=traduccion.split()
            for x in lista_aux:
                palabras_procesadas.append(x.lower())
        else:
            palabras_procesadas.append(palabra.lower())
    texto=' '.join([word for word in palabras_procesadas])
    return texto

In [584]:
train_set

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


### Feature: Cantidad de caracteres usados en tweets

In [585]:
cuenta_caract = train_set[['id','text']]
cuenta_caract['caracteres_usados'] = cuenta_caract['text'].str.len()
cuenta_caract_t = test_set[['id','text']]
cuenta_caract_t['caracteres_usados'] = cuenta_caract_t['text'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cuenta_caract['caracteres_usados'] = cuenta_caract['text'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cuenta_caract_t['caracteres_usados'] = cuenta_caract_t['text'].str.len()


In [586]:
cuenta_caract = cuenta_caract[['id','caracteres_usados']]
cuenta_caract_t = cuenta_caract_t[['id','caracteres_usados']]

In [587]:
train_set=train_set.merge(cuenta_caract,on='id',how='left')

In [588]:
test_set=test_set.merge(cuenta_caract_t,on='id',how='left')

### Feature: cantidad de menciones por tweet

In [589]:
cant_menciones = train_set[['id','text']]
cant_menciones['menciones_realizadas']=cant_menciones['text'].str.count('@')
cant_menciones_t = test_set[['id','text']]
cant_menciones_t['menciones_realizadas']=cant_menciones_t['text'].str.count('@')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_menciones['menciones_realizadas']=cant_menciones['text'].str.count('@')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_menciones_t['menciones_realizadas']=cant_menciones_t['text'].str.count('@')


In [590]:
cant_menciones = cant_menciones[['id','menciones_realizadas']]
cant_menciones_t=cant_menciones_t[['id','menciones_realizadas']]

In [591]:
train_set=train_set.merge(cant_menciones,on='id',how='left')

In [592]:
test_set=test_set.merge(cant_menciones_t,on='id',how='left')

### Feature: ID que comparten localización

In [593]:
tiene_localizacion = train_set[['id','location']]
tiene_localizacion_t = test_set[['id','location']]

In [594]:
tiene_localizacion['location'].loc[~(tiene_localizacion['location'].isnull())]=1
tiene_localizacion_t['location'].loc[~(tiene_localizacion_t['location'].isnull())]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tiene_localizacion['location'].loc[~(tiene_localizacion['location'].isnull())]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tiene_localizacion_t['location'].loc[~(tiene_localizacion_t['location'].isnull())]=1


In [595]:
tiene_localizacion.fillna(0,inplace=True)
tiene_localizacion_t.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [596]:
tiene_localizacion['location'].value_counts()

1    5080
0    2533
Name: location, dtype: int64

In [597]:
tiene_localizacion = tiene_localizacion.rename(columns={'location':'permite_location'})
tiene_localizacion_t = tiene_localizacion_t.rename(columns={'location':'permite_location'})

In [598]:
train_set=train_set.merge(tiene_localizacion,on='id',how='left')

In [599]:
test_set=test_set.merge(tiene_localizacion_t,on='id',how='left')

### Feature: usa Keyword

In [600]:
usa_keyword = train_set[['id','keyword']]
usa_keyword['keyword'].loc[~(usa_keyword['keyword'].isnull())] = 1
usa_keyword_t = test_set[['id','keyword']]
usa_keyword_t['keyword'].loc[~(usa_keyword_t['keyword'].isnull())] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_keyword['keyword'].loc[~(usa_keyword['keyword'].isnull())] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_keyword_t['keyword'].loc[~(usa_keyword_t['keyword'].isnull())] = 1


In [601]:
usa_keyword.fillna(0,inplace=True)
usa_keyword_t.fillna(0,inplace=True)

In [602]:
usa_keyword = usa_keyword.rename(columns={'keyword':'use_keyword'})
usa_keyword_t = usa_keyword_t.rename(columns={'keyword':'use_keyword'})

In [603]:
train_set = train_set.merge(usa_keyword,on='id',how='left')

In [604]:
test_set=test_set.merge(usa_keyword_t,on='id',how='left')

### Feature: Cita URL en Tweet

In [605]:
usa_url = train_set[['id','text']]
usa_url_t = test_set[['id','text']]

In [606]:
usa_url['cita_url'] = usa_url['text'].str.count('http')
usa_url_t['cita_url'] = usa_url_t['text'].str.count('http')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_url['cita_url'] = usa_url['text'].str.count('http')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_url_t['cita_url'] = usa_url_t['text'].str.count('http')


In [607]:
usa_url = usa_url[['id','cita_url']]
usa_url_t = usa_url_t[['id','cita_url']]

In [608]:
train_set = train_set.merge(usa_url,on='id',how='left')

In [609]:
test_set=test_set.merge(usa_url_t,on='id',how='left')

### Feature: usa Hashtag

In [610]:
usa_hashtag = train_set[['id','text']]
usa_hashtag['use_hashtag']=usa_hashtag['text'].str.count('#')
usa_hashtag_t = test_set[['id','text']]
usa_hashtag_t['use_hashtag']=usa_hashtag_t['text'].str.count('#')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_hashtag['use_hashtag']=usa_hashtag['text'].str.count('#')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_hashtag_t['use_hashtag']=usa_hashtag_t['text'].str.count('#')


In [611]:
usa_hashtag = usa_hashtag[['id','use_hashtag']]
usa_hashtag_t = usa_hashtag_t[['id','use_hashtag']]

In [612]:
train_set = train_set.merge(usa_hashtag,on='id',how='left')

In [613]:
test_set=test_set.merge(usa_hashtag_t,on='id',how='left')

### Feature: Cantidad de palabras

In [614]:
cant_palabras = train_set[['id', 'text']]
cant_palabras['cant_palabras'] = cant_palabras['text'].str.count(' ') + 1
cant_palabras_t = test_set[['id', 'text']]
cant_palabras_t['cant_palabras'] = cant_palabras_t['text'].str.count(' ') + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_palabras['cant_palabras'] = cant_palabras['text'].str.count(' ') + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_palabras_t['cant_palabras'] = cant_palabras_t['text'].str.count(' ') + 1


In [615]:
cant_palabras = cant_palabras[['id', 'cant_palabras']]
cant_palabras_t = cant_palabras_t[['id', 'cant_palabras']]

In [616]:
train_set = train_set.merge(cant_palabras,on='id',how='left')
test_set = test_set.merge(cant_palabras_t,on='id',how='left')

### Feature: Cantidad de abreviaciones de internet

In [617]:
def contar_abreviaciones(data):
    lista_cant = []
    for tweet in data:
        cant = 0
        tweet = quitar_stopwords(tweet)
        tweet = tweet.upper()
        tweet = tweet.split()
        for word in tweet:
            if (word in Diccionario_de_lenguaje_de_internet):
                cant = cant + 1
        lista_cant = lista_cant + [cant]
    return lista_cant

In [618]:
cant_abreviaciones = train_set[['id', 'text']]
cant_abreviaciones['cant_abreviaciones'] = contar_abreviaciones(cant_abreviaciones['text'])
cant_abreviaciones_t = test_set[['id', 'text']]
cant_abreviaciones_t['cant_abreviaciones'] = contar_abreviaciones(cant_abreviaciones_t['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_abreviaciones['cant_abreviaciones'] = contar_abreviaciones(cant_abreviaciones['text'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cant_abreviaciones_t['cant_abreviaciones'] = contar_abreviaciones(cant_abreviaciones_t['text'])


In [619]:
cant_abreviaciones = cant_abreviaciones[['id', 'cant_abreviaciones']]
cant_abreviaciones_t = cant_abreviaciones_t[['id', 'cant_abreviaciones']]

In [620]:
train_set = train_set.merge(cant_abreviaciones,on='id',how='left')
test_set = test_set.merge(cant_abreviaciones_t,on='id',how='left')

### Feature: Location en Estados Unidos

In [621]:
lista_estados_usa = ["ALABAMA", "AL", "ALASKA", "AK", 
                     "ARIZONA", "AZ", "ARKANSAS", "AR", 
                     "CALIFORNIA", "CA", "COLORADO", "CO", 
                     "CONNECTICUT", "CT", "DELAWARE", "DE", 
                     "FLORIDA", "FL", "GEORGIA", "GA", 
                     "HAWAII", "HI", "IDAHO", "ID", 
                     "ILLINOIS", "IL", "INDIANA", "IN", 
                     "IOWA", "IA", "KANSAS", "KS", 
                     "KENTUCKY", "KY", "LOUISIANA", "LA", 
                     "MAINE", "ME", "MARYLAND", "MD", 
                     "MASSACHUSETTS", "MA", "MICHIGAN", "MI", 
                     "MINNESOTA", "MN", "MISSISSIPPI", "MS", 
                     "MISSOURI", "MO", "MONTANA", "MT", 
                     "NEBRASKA", "NE", "NEVADA", "NV", 
                     "NEW HAMPSHIRE", "NH", "NEW JERSEY", 
                     "NJ", "NEW MEXICO", "NM", "NEW YORK", 
                     "NY", "NORTH CAROLINA", "NC", "NORTH DAKOTA", 
                     "ND", "OHIO", "OH", "OKLAHOMA", "OK", 
                     "OREGON", "OR", "PENNSYLVANIA", "PA", 
                     "RHODE ISLAND", "RI", "SOUTH CAROLINA", "SC", "CAROLINA", 
                     "SOUTH DAKOTA", "SD", "TENNESSEE", "TN", 
                     "TEXAS", "TX", "UTAH", "UT", 
                     "VERMONT", "VT", "VIRGINIA", "VA", 
                     "WASHINGTON", "WA", "WEST VIRGINIA", "WV", 
                     "WISCONSIN", "WI", "WYOMING", "WY",
                     "USA", "UNITED STATES", "SAN FRANCISCO"]

In [622]:
def ver_location_usa(data):
    lista_ubicacion_usa = []
    for location in data:
        if (location is None):
            lista_ubicacion_usa += [0]
        else:
            location_usa = False
            location = str(location)
            location = location.upper()
            if (location in lista_estados_usa):
                location_usa = True
            location = location.split()
            for word in location:
                if (word in lista_estados_usa):
                    location_usa = True
                    break
            if (location_usa):
                lista_ubicacion_usa += [1]
            else:
                lista_ubicacion_usa += [0]
    return lista_ubicacion_usa

In [623]:
location_usa = train_set[['id', 'location']]
location_usa['location_usa'] = ver_location_usa(location_usa['location'])
location_usa_t = test_set[['id', 'location']]
location_usa_t['location_usa'] = ver_location_usa(location_usa_t['location'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_usa['location_usa'] = ver_location_usa(location_usa['location'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_usa_t['location_usa'] = ver_location_usa(location_usa_t['location'])


In [624]:
location_usa = location_usa[['id', 'location_usa']]
location_usa_t = location_usa_t[['id', 'location_usa']]

In [625]:
train_set = train_set.merge(location_usa,on='id',how='left')
test_set = test_set.merge(location_usa_t,on='id',how='left')

### Feature: Tweet tiene caritas

In [626]:
lista_caritas = [":)", ";)", ":(", ";(", "XD", "xD", "xd", ":P", ":p", "-_-", ":O", ":'(", ":D", ":-D"]

In [627]:
def buscar_caritas(data):
    tiene_carita = []
    for tweet in data:
        tweet = quitar_menciones(tweet)
        tweet = quitar_links(tweet)
        hay_carita = False
        for carita in lista_caritas:
            if (carita in tweet):
                hay_carita = True
                break
        if (hay_carita):
            tiene_carita += [1]
        else:
            tiene_carita += [0]
    return tiene_carita

In [628]:
has_emoji = train_set[['id', 'text']]
has_emoji['has_emoji'] = buscar_caritas(has_emoji['text'])
has_emoji_t = test_set[['id', 'text']]
has_emoji_t['has_emoji'] = buscar_caritas(has_emoji_t['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_emoji['has_emoji'] = buscar_caritas(has_emoji['text'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_emoji_t['has_emoji'] = buscar_caritas(has_emoji_t['text'])


In [629]:
has_emoji = has_emoji[['id', 'has_emoji']]
has_emoji_t = has_emoji_t[['id', 'has_emoji']]

In [630]:
train_set = train_set.merge(has_emoji,on='id',how='left')
test_set = test_set.merge(has_emoji_t,on='id',how='left')

### Feature: Palabras con misma letra repetida

In [636]:
lista_letras_seguidas = ["aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg", "hhh", "iii", "jjj", "kkk", "lll", "mmm", "nnn", "ooo", "ppp", "qqq", "rrr", "sss", "ttt", "uuu", "vvv", "www", "xxx", "yyy", "zzz"]

In [638]:
def buscar_letras_seguidas(data):
    tiene_letras_seguidas = []
    for tweet in data:
        tweet = quitar_menciones(tweet)
        tweet = quitar_links(tweet)
        hay_letra_seguida = False
        for letra in lista_letras_seguidas:
            if (letra in tweet):
                hay_letra_seguida = True
                break
        if (hay_letra_seguida):
            tiene_letras_seguidas += [1]
        else:
            tiene_letras_seguidas += [0]
    return tiene_letras_seguidas

In [631]:
train_set.to_csv('train_data.csv')

In [632]:
test_set.to_csv('test_data.csv')

In [633]:
train_set

Unnamed: 0,id,keyword,location,text,target,caracteres_usados,menciones_realizadas,permite_location,use_keyword,cita_url,use_hashtag,cant_palabras,cant_abreviaciones,location_usa,has_emoji
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,0,0,0,0,1,13,0,0,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,0,0,0,0,0,7,0,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,133,0,0,0,0,0,22,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,0,0,0,0,1,9,0,0,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,0,0,0,0,2,17,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,83,0,0,0,1,0,11,0,0,0
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,125,2,0,0,0,0,20,0,0,0
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,65,0,0,0,1,0,8,0,0,0
7611,10872,,,Police investigating after an e-bike collided ...,1,137,0,0,0,0,0,19,0,0,0


In [634]:
test_set

Unnamed: 0,id,keyword,location,text,caracteres_usados,menciones_realizadas,permite_location,use_keyword,cita_url,use_hashtag,cant_palabras,cant_abreviaciones,location_usa,has_emoji
0,0,,,Just happened a terrible car crash,34,0,0,0,0,0,6,0,0,0
1,2,,,"Heard about #earthquake is different cities, s...",64,0,0,0,0,1,9,0,0,0
2,3,,,"there is a forest fire at spot pond, geese are...",96,0,0,0,0,0,19,0,0,0
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,0,0,0,0,2,4,0,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,0,0,0,0,0,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,55,0,0,0,0,0,8,0,0,0
3259,10865,,,Storm in RI worse than last hurricane. My city...,139,0,0,0,0,0,23,0,0,0
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,55,0,0,0,1,0,6,0,0,0
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,65,0,0,0,1,0,7,0,0,0


In [635]:
prueba['hola'] = prueba['text'].apply(reemplazar_lenguaje_internet)

NameError: name 'prueba' is not defined

In [None]:
train_set.groupby('keyword').count()

In [None]:
'YOLO' in Diccionario_de_lenguaje_de_internet