In [281]:
import pandas as pd
import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline
#from wordcloud import WordCloud
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict

In [282]:
tweets = pd.read_csv('train.csv')
tweets['length']=tweets['text'].str.len()
tweets['keyword'] = tweets['keyword'].str.replace('%20', ' ')
tweets['keyword'].fillna('no keyword', inplace = True)
tweets = tweets.sample(frac=1,random_state=1)
tweets_test = pd.read_csv('test.csv')

In [283]:
tweets['clean_text'] = tweets['text'].str.lower()
tweets_test['clean_text'] = tweets_test['text'].str.lower()

In [284]:
def only_letters(tweet):
    tweet = re.sub(r'http\S*', '', tweet)
    tweet = re.sub(r'[^a-z\s]', '', tweet)
    return tweet

In [285]:
tweets['clean_text'] = tweets['clean_text'].apply(only_letters)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(only_letters)

In [286]:
#Tokenización
from nltk.tokenize import word_tokenize
tweets['clean_text'] = tweets['clean_text'].apply(word_tokenize)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(word_tokenize)
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [287]:
def filter_stopwords(tokenized_text):
    not_stopwords=[]
    for w in tokenized_text:
        if w not in stop_words:
            not_stopwords.append(w)
    return not_stopwords

In [288]:
tweets['clean_text'] = tweets['clean_text'].apply(filter_stopwords)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(filter_stopwords)

In [289]:
#Lematización
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [290]:
def lemmatize_tweet(tweet):
    lemmatized_words = []
    for word in tweet:
        lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

In [291]:
tweets['clean_text'] = tweets['clean_text'].apply(lemmatize_tweet)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lemmatize_tweet)
tweets['clean_text'] = tweets['clean_text'].apply(lambda text:' '.join(text))
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text:' '.join(text))
tweets['clean_text'] = tweets['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))

In [292]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target,length,clean_text
3228,4632,emergency services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1,141,goulburn man henry van bilsen missing emergenc...
3706,5271,fear,,The things we fear most in organizations--fluc...,0,138,thing fear organizationsfluctuations disturban...
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0,23,tsunamiesh hey esh
2887,4149,drown,,@POTUS you until you drown by water entering t...,0,140,potus drown water entering lung alive caused g...
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\r\nThese wounds they will ...,1,51,crawling skin wound hea


**TF-IDF**

In [293]:
tfidf_vectorizer=TfidfVectorizer(ngram_range=(1,1),min_df=2)
train_tfidf=tfidf_vectorizer.fit_transform(tweets['clean_text'])

In [294]:
X = train_tfidf
y = tweets['target'].values

**Regresión logística**

In [295]:
model=LogisticRegression(class_weight='balanced')
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.76837945 0.76198083 0.74841772 0.76183088 0.76062992]


In [296]:
#Set train
y_pred_log = cross_val_predict(model, X, y, cv=5)
f1score = f1_score(tweets['target'], y_pred_log)
print(f'Counts model score: {f1score*100}%')

Counts model score: 76.02523659305994%


In [297]:
pd.crosstab(tweets['target'],y_pred_log)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3683,659
1,861,2410


In [298]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(tweets['target'], y_pred_log))

[[3683  659]
 [ 861 2410]]


In [299]:
#Regressor
y_pred_log_probabilities = cross_val_predict(model, X, y, cv=5, method='predict_proba')
y_pred_log_probabilities = np.array(y_pred_log_probabilities)

In [300]:
y_train_log_prob = y_pred_log_probabilities[:,1]

In [301]:
np.savetxt("train_log_prob.csv", pd.DataFrame(y_train_log_prob))

In [302]:
#Set test
model.fit(X, y)
test_predictions = model.predict(tfidf_vectorizer.transform(tweets_test['clean_text']))
test_predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [303]:
tweets_test['target'] = test_predictions

In [304]:
tweets_test.head(20)

Unnamed: 0,id,keyword,location,text,clean_text,target
0,0,,,Just happened a terrible car crash,happened terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...,1
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,1
5,12,,,We're shaking...It's an earthquake,shakingits earthquake,1
6,21,,,They'd probably still show more life than Arse...,theyd probably still show life arsenal yesterd...,0
7,22,,,Hey! How are you?,hey,0
8,27,,,What a nice hat?,nice hat,0
9,29,,,Fuck off!,fuck,0


In [305]:
submission = tweets_test.loc[:,['id','target']]
submission.head(20)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [306]:
submission.to_csv(path_or_buf='submission.csv',header=True,index=False)

In [307]:
#print(list(y_pred_log))

In [308]:
#Test proba

In [309]:
test_predictions = model.predict_proba(tfidf_vectorizer.transform(tweets_test['clean_text']))
test_predictions = np.array(test_predictions)
test_predictions

array([[0.21177565, 0.78822435],
       [0.3256102 , 0.6743898 ],
       [0.19248937, 0.80751063],
       ...,
       [0.22083745, 0.77916255],
       [0.25862416, 0.74137584],
       [0.44719953, 0.55280047]])

In [310]:
y_test_log_prob = test_predictions[:,1]

In [311]:
y_test_log_prob

array([0.78822435, 0.6743898 , 0.80751063, ..., 0.77916255, 0.74137584,
       0.55280047])

In [312]:
np.savetxt("test_log_prob.csv", pd.DataFrame(y_test_log_prob))

**Perceptrón**

In [313]:
from sklearn.linear_model import Perceptron

model=Perceptron(tol=1e-3, random_state=0)
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.68527132 0.69300912 0.69923664 0.69327421 0.69353612]


In [314]:
y_pred_per=cross_val_predict(model, X, y, cv=10)
f1score = f1_score(tweets['target'], y_pred_per)
print(f'Counts model score: {f1score*100}%')

Counts model score: 69.41643751909564%


In [315]:
pd.crosstab(tweets['target'],y_pred_per)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3339,1003
1,999,2272


In [316]:
y_pred_per

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [317]:
model.fit(X, y)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

**Regresión lineal**

In [318]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [319]:
print(cross_val_score(model, X, y, cv=5))

[ -26.39675499  -45.62558326  -37.26173536  -21.75104382 -104.5037961 ]


In [320]:
y_pred_lin=cross_val_predict(model, X, y, cv=5)
np.savetxt("train_lin.csv", pd.DataFrame(y_pred_lin))

In [321]:
#list(y_pred_lin)

In [322]:
y_pred_lin = y_pred_lin >=0.5
        
f1score = f1_score(tweets['target'], y_pred_lin)
print(f'Counts model score: {f1score*100}%')

Counts model score: 57.326846021751564%


In [323]:
pd.crosstab(tweets['target'],y_pred_lin)

col_0,False,True
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2628,1714
1,1268,2003


In [324]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [325]:
test_lin_predictions = model.predict(tfidf_vectorizer.transform(tweets_test['clean_text']))
#list(test_lin_predictions)

In [326]:
tweets_test['lin_pred'] = test_lin_predictions
submission = tweets_test.loc[:,['id','lin_pred']]
submission.head(10)

Unnamed: 0,id,lin_pred
0,0,0.789428
1,2,0.575369
2,3,0.568077
3,9,0.83326
4,11,0.642
5,12,1.126309
6,21,-0.052035
7,22,0.146149
8,27,-0.246498
9,29,-0.288499


In [327]:
submission.to_csv(path_or_buf='lin_pred.csv',header=True,index=False)

**Promediando resultados**

In [328]:
y_p = (y_pred_log + y_pred_per + y_pred_lin)/3
y_p = y_p >=0.5
y_p

array([False, False, False, ...,  True, False, False])

In [329]:
f1score = f1_score(tweets['target'], y_p)
print(f'Counts model score: {f1score*100}%')

Counts model score: 72.2127196392474%


In [330]:
#no sirve, es mejor la regresión logística sola