In [1]:
import pandas as pd
import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline
#from wordcloud import WordCloud
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict

In [2]:
tweets = pd.read_csv('train.csv')
tweets['length']=tweets['text'].str.len()
tweets['keyword'] = tweets['keyword'].str.replace('%20', ' ')
tweets['keyword'].fillna('no keyword', inplace = True)
tweets = tweets.sample(frac=1,random_state=1)
tweets_test = pd.read_csv('test.csv')

In [3]:
tweets['clean_text'] = tweets['text'].str.lower()
tweets_test['clean_text'] = tweets_test['text'].str.lower()

In [4]:
def only_letters(tweet):
    tweet = re.sub(r'http\S*', '', tweet)
    tweet = re.sub(r'[^a-z\s]', '', tweet)
    return tweet

In [5]:
tweets['clean_text'] = tweets['clean_text'].apply(only_letters)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(only_letters)

In [6]:
#Tokenización
from nltk.tokenize import word_tokenize
tweets['clean_text'] = tweets['clean_text'].apply(word_tokenize)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(word_tokenize)
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [7]:
def filter_stopwords(tokenized_text):
    not_stopwords=[]
    for w in tokenized_text:
        if w not in stop_words:
            not_stopwords.append(w)
    return not_stopwords

In [8]:
tweets['clean_text'] = tweets['clean_text'].apply(filter_stopwords)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(filter_stopwords)

In [9]:
#Lematización
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
def lemmatize_tweet(tweet):
    lemmatized_words = []
    for word in tweet:
        lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

In [11]:
tweets['clean_text'] = tweets['clean_text'].apply(lemmatize_tweet)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lemmatize_tweet)
tweets['clean_text'] = tweets['clean_text'].apply(lambda text:' '.join(text))
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text:' '.join(text))
tweets['clean_text'] = tweets['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))

In [12]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target,length,clean_text
3228,4632,emergency services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1,141,goulburn man henry van bilsen missing emergenc...
3706,5271,fear,,The things we fear most in organizations--fluc...,0,138,thing fear organizationsfluctuations disturban...
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0,23,tsunamiesh hey esh
2887,4149,drown,,@POTUS you until you drown by water entering t...,0,140,potus drown water entering lung alive caused g...
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\r\nThese wounds they will ...,1,51,crawling skin wound hea


**TF-IDF**

In [13]:
tfidf_vectorizer=TfidfVectorizer(ngram_range=(1,1))
train_tfidf=tfidf_vectorizer.fit_transform(tweets['clean_text'])

In [14]:
X = train_tfidf
y = tweets['target'].values

**Regresión logística**

In [15]:
model=LogisticRegression(class_weight='balanced')
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.76775738 0.74918033 0.74350649 0.76475216 0.76836158]


In [16]:
#Set train
y_pred_log = cross_val_predict(model, X, y, cv=5)
f1score = f1_score(tweets['target'], y_pred_log)
print(f'Counts model score: {f1score*100}%')

Counts model score: 75.88093322606598%


In [104]:
pd.crosstab(tweets['target'],y_pred_log)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3756,586
1,913,2358


In [105]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(tweets['target'], y_pred_log))

[[3756  586]
 [ 913 2358]]


In [106]:
#Set test
model.fit(X, y)
test_predictions = model.predict(tfidf_vectorizer.transform(tweets_test['clean_text']))
test_predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [107]:
tweets_test['target'] = test_predictions

In [108]:
tweets_test.head(20)

Unnamed: 0,id,keyword,location,text,clean_text,target
0,0,,,Just happened a terrible car crash,just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",heard about earthquake is different city stay ...,1
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond goose are ...,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill in china and taiwan,1
5,12,,,We're shaking...It's an earthquake,were shakingits an earthquake,1
6,21,,,They'd probably still show more life than Arse...,theyd probably still show more life than arsen...,0
7,22,,,Hey! How are you?,hey how are you,0
8,27,,,What a nice hat?,what a nice hat,0
9,29,,,Fuck off!,fuck off,0


In [109]:
submission = tweets_test.loc[:,['id','target']]
submission.head(20)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [110]:
submission.to_csv(path_or_buf='submission.csv',header=True,index=False)

**Perceptrón**

In [111]:
from sklearn.linear_model import Perceptron

model=Perceptron(tol=1e-3, random_state=0)
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.71282455 0.69818755 0.7        0.71671827 0.73388931]


In [112]:
y_pred_per=cross_val_predict(model, X, y, cv=5)
f1score = f1_score(tweets['target'], y_pred_per)
print(f'Counts model score: {f1score*100}%')

Counts model score: 71.24092103229795%


In [113]:
pd.crosstab(tweets['target'],y_pred_per)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3447,895
1,966,2305


In [114]:
y_pred_per

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

In [115]:
model.fit(X, y)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

**Regresión lineal**

In [116]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [117]:
print(cross_val_score(model, X, y, cv=5))

[-0.2998534  -0.15704838 -0.06237869 -0.47954766 -0.19722186]


In [118]:
y_pred_lin=cross_val_predict(model, X, y, cv=5)

In [119]:
list(y_pred_lin)

[0.3792919582194989,
 1.020302556502917,
 -0.3968989963505333,
 0.5518397769690528,
 0.6762294374998441,
 0.07068145318525043,
 0.24999993898847472,
 0.4115732834550999,
 0.9999999631365436,
 0.31232053360838763,
 -0.31684420765019483,
 -0.5707361384651334,
 0.10217313124202343,
 9.83800441112237e-09,
 0.6583692220277825,
 0.39480420179223064,
 0.7576465691520801,
 0.29707000702235387,
 0.8414316719825469,
 0.6594410569012634,
 0.7903140920345779,
 -0.5604008316922479,
 1.077084968503789,
 -0.5244024708307543,
 0.38461450735757263,
 1.0219423872055187,
 0.6686873942949018,
 0.4912418412951175,
 -0.20250342894094853,
 0.2213450020954918,
 -0.21609538728746835,
 1.306433395968765,
 -0.2404767764459529,
 0.9160827986537066,
 0.48592359302125016,
 0.2669935925485436,
 0.9999998391966851,
 -0.5941818184754416,
 -0.31266425733161995,
 0.9107938788401402,
 1.2902570037032233,
 0.58370019474643,
 0.2424179824082694,
 -0.14908599274678125,
 0.23421848674957035,
 0.22404284293822796,
 0.74172459

In [120]:
y_pred_lin = y_pred_lin >=0.5
        
f1score = f1_score(tweets['target'], y_pred_lin)
print(f'Counts model score: {f1score*100}%')

Counts model score: 65.19836227587182%


In [121]:
pd.crosstab(tweets['target'],y_pred_lin)

col_0,False,True
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2839,1503
1,962,2309


In [122]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

**Promediando resultados**

In [123]:
y_p = (y_pred_log + y_pred_per + y_pred_lin)/3
y_p = y_p >=0.5
y_p

array([ True,  True, False, ...,  True, False, False])

In [124]:
f1score = f1_score(tweets['target'], y_p)
print(f'Counts model score: {f1score*100}%')

Counts model score: 73.48591001089834%


In [125]:
#no sirve, es mejor la regresión logística sola