In [99]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict

In [100]:
tweets = pd.read_csv('train.csv')
tweets = tweets.sample(frac=1,random_state=1)
tweets_test = pd.read_csv('test.csv')

In [101]:
tweets['clean_text'] = tweets['text'].str.lower()
tweets_test['clean_text'] = tweets_test['text'].str.lower()

In [102]:
def only_letters(tweet):
    tweet = re.sub(r'http\S*', '', tweet)
    tweet = re.sub(r'[^a-z\s]', '', tweet)
    return tweet

In [103]:
tweets['clean_text'] = tweets['clean_text'].apply(only_letters)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(only_letters)

In [104]:
#Tokenización
from nltk.tokenize import word_tokenize
tweets['clean_text'] = tweets['clean_text'].apply(word_tokenize)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(word_tokenize)
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [105]:
#Eliminación de stopwords
def filter_stopwords(tokenized_text):
    not_stopwords=[]
    for w in tokenized_text:
        if w not in stop_words:
            not_stopwords.append(w)
    return not_stopwords

In [106]:
tweets['clean_text'] = tweets['clean_text'].apply(filter_stopwords)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(filter_stopwords)

In [107]:
#Lematización
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [108]:
def lemmatize_tweet(tweet):
    lemmatized_words = []
    for word in tweet:
        lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

In [109]:
tweets['clean_text'] = tweets['clean_text'].apply(lemmatize_tweet)
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lemmatize_tweet)
tweets['clean_text'] = tweets['clean_text'].apply(lambda text:' '.join(text))
tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text:' '.join(text))
#tweets['clean_text'] = tweets['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))
#tweets_test['clean_text'] = tweets_test['clean_text'].apply(lambda text: re.sub(r'amp | im', '', text))

In [110]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
3228,4632,emergency%20services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1,goulburn man henry van bilsen missing emergenc...
3706,5271,fear,,The things we fear most in organizations--fluc...,0,thing fear organizationsfluctuations disturban...
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0,tsunamiesh hey esh
2887,4149,drown,,@POTUS you until you drown by water entering t...,0,potus drown water entering lung alive caused g...
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\r\nThese wounds they will ...,1,crawling skin wound hea


**TF-IDF**

In [111]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=2)
train_tfidf = tfidf_vectorizer.fit_transform(tweets['clean_text'])

In [112]:
X = train_tfidf
y = tweets['target'].values

In [113]:
X_test = tfidf_vectorizer.transform(tweets_test['clean_text'])

**Regresión logística**

In [114]:
model = LogisticRegression(class_weight='balanced')
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.76837945 0.76198083 0.74841772 0.76183088 0.76062992]


In [115]:
#Predicciones set de train
y_pred_log = cross_val_predict(model, X, y, cv=5)
f1score = f1_score(tweets['target'], y_pred_log)
print(f'Counts model score: {f1score}')

Counts model score: 0.7602523659305994


In [116]:
pd.crosstab(tweets['target'],y_pred_log)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3683,659
1,861,2410


In [117]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(tweets['target'], y_pred_log))

[[3683  659]
 [ 861 2410]]


In [118]:
#Predicciones set de test
model.fit(X, y)
test_predictions = model.predict(X_test)
test_predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [119]:
tweets_test['target'] = test_predictions

In [120]:
tweets_test.head(10)

Unnamed: 0,id,keyword,location,text,clean_text,target
0,0,,,Just happened a terrible car crash,happened terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...,1
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan,1
5,12,,,We're shaking...It's an earthquake,shakingits earthquake,1
6,21,,,They'd probably still show more life than Arse...,theyd probably still show life arsenal yesterd...,0
7,22,,,Hey! How are you?,hey,0
8,27,,,What a nice hat?,nice hat,0
9,29,,,Fuck off!,fuck,0


In [121]:
submission = tweets_test.loc[:,['id','target']]
submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [122]:
submission.to_csv(path_or_buf='submissiontfidflogreg.csv',header=True,index=False)

In [123]:
#Predicciones set de test (probabilidad)

In [124]:
test_predictions = model.predict_proba(tfidf_vectorizer.transform(tweets_test['clean_text']))
test_predictions = np.array(test_predictions)
test_predictions

array([[0.21177565, 0.78822435],
       [0.3256102 , 0.6743898 ],
       [0.19248937, 0.80751063],
       ...,
       [0.22083745, 0.77916255],
       [0.25862416, 0.74137584],
       [0.44719953, 0.55280047]])

In [125]:
y_test_log_prob = test_predictions[:,1]

In [126]:
y_test_log_prob

array([0.78822435, 0.6743898 , 0.80751063, ..., 0.77916255, 0.74137584,
       0.55280047])

In [127]:
np.savetxt("preds_tfidflogreg_prob.csv", pd.DataFrame(y_test_log_prob))

**Perceptrón**

In [128]:
from sklearn.linear_model import Perceptron

model=Perceptron(tol=1e-3, random_state=0)
print(cross_val_score(model, X, y, cv=5,scoring='f1'))

[0.68527132 0.69300912 0.69923664 0.69327421 0.69353612]


In [129]:
y_pred_per=cross_val_predict(model, X, y, cv=10)
f1score = f1_score(tweets['target'], y_pred_per)
print(f'Counts model score: {f1score}')

Counts model score: 0.6941643751909563


In [130]:
pd.crosstab(tweets['target'],y_pred_per)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3339,1003
1,999,2272


In [131]:
y_pred_per

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [132]:
model.fit(X, y)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [133]:
test_perceptron_predictions = model.predict(X_test)

In [134]:
tweets_test['lin_pred'] = test_lin_predictions
submission = tweets_test.loc[:,['id','lin_pred']]
submission.head(10)

Unnamed: 0,id,lin_pred
0,0,0.789428
1,2,0.575369
2,3,0.568077
3,9,0.83326
4,11,0.642
5,12,1.126309
6,21,-0.052035
7,22,0.146149
8,27,-0.246498
9,29,-0.288499


In [135]:
submission.to_csv(path_or_buf='preds_perceptron.csv',header=True,index=False)

**Regresión lineal**

In [136]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [137]:
print(cross_val_score(model, X, y, cv=5))

[ -26.39675499  -45.62558326  -37.26173536  -21.75104382 -104.5037961 ]


In [138]:
y_pred_lin=cross_val_predict(model, X, y, cv=5)

In [139]:
#list(y_pred_lin)

In [140]:
y_pred_lin = y_pred_lin >=0.5
        
f1score = f1_score(tweets['target'], y_pred_lin)
print(f'Counts model score: {f1score}')

Counts model score: 0.5732684602175157


In [141]:
pd.crosstab(tweets['target'],y_pred_lin)

col_0,False,True
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2628,1714
1,1268,2003


In [142]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [143]:
test_lin_predictions = model.predict(X_test)
#list(test_lin_predictions)

In [144]:
tweets_test['lin_pred'] = test_lin_predictions
submission = tweets_test.loc[:,['id','lin_pred']]
submission.head(10)

Unnamed: 0,id,lin_pred
0,0,0.789428
1,2,0.575369
2,3,0.568077
3,9,0.83326
4,11,0.642
5,12,1.126309
6,21,-0.052035
7,22,0.146149
8,27,-0.246498
9,29,-0.288499


In [145]:
submission.to_csv(path_or_buf='preds_linreg.csv',header=True,index=False)

**Promediando resultados**

In [146]:
y_p = (y_pred_log + y_pred_per + y_pred_lin)/3
y_p = y_p >=0.5
y_p

array([False, False, False, ...,  True, False, False])

In [147]:
f1score = f1_score(tweets['target'], y_p)
print(f'Counts model score: {f1score}')

Counts model score: 0.7221271963924739


In [148]:
#no sirve, es mejor la regresión logística sola