In [1]:
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Шаг 1 Удаляем стоп-слова

In [4]:
stop_words = stopwords.words('english')
def del_stop(text, stop_words):
    tokenizer = TweetTokenizer() 
    token_list = tokenizer.tokenize(text.lower())
    return ' '.join([token for token in token_list if token not in stop_words])

train['text_1'] = train['text'].apply(lambda x: del_stop(x, stop_words))
test['text_1'] = test['text'].apply(lambda x: del_stop(x, stop_words))

## Шаг 2 Лемматизация

In [5]:
def lemmatize_text(text):
    tokenizer = TweetTokenizer() 
    token_list = tokenizer.tokenize(text.lower())
    lmtz = WordNetLemmatizer()
    return ' '.join([lmtz.lemmatize(token,'v') for token in token_list])

train['text_2'] = train['text_1'].apply(lambda x: lemmatize_text(x))
test['text_2'] = test['text_1'].apply(lambda x: lemmatize_text(x))

## Шаг 3 Удаляем пунктуацию

In [6]:
def del_punc(text, punctuation):
    tokenizer = TweetTokenizer() 
    token_list = tokenizer.tokenize(text.lower())
    return ' '.join([token for token in token_list if token not in punctuation])    

train['text_3'] = train['text_2'].apply(lambda x: del_punc(x, punctuation))
test['text_3'] = test['text_2'].apply(lambda x: del_punc(x, punctuation))

## Шаг 4 Токенизация

In [7]:
def get_tokens(text):
    tokenizer = TweetTokenizer() 
    token_list = tokenizer.tokenize(text.lower())
    return token_list

train['tokens'] = train['text_3'].apply(lambda x: get_tokens(x))
test['tokens'] = test['text_3'].apply(lambda x: get_tokens(x))

## Шаг 5 TF-IDF

In [98]:
word_vectorizer = TfidfVectorizer(max_df=0.99, min_df=0.003, max_features=30000, sublinear_tf=True)
word_vectorizer.fit(train['text_3'])

x_train = word_vectorizer.transform(train['text_3'])
print(x_train.shape)

x_test = word_vectorizer.transform(test['text_3'])
print(x_test.shape)

(7613, 648)
(3263, 648)


In [90]:
l = list(tfidf.vocabulary_.items())
l = sorted(l, reverse=True) 
voc = dict(l)
print(len(voc))
voc

1613


{'ûó': 1612,
 'ûò': 1611,
 'ûïwhen': 1610,
 'ûï': 1609,
 'ûªt': 1608,
 'ûªs': 1607,
 'ûªm': 1606,
 'ûª': 1605,
 'û_': 1604,
 'åê': 1603,
 'zone': 1602,
 'zombie': 1601,
 'zero': 1600,
 'yyc': 1599,
 'youtube': 1598,
 'young': 1597,
 'york': 1596,
 'yo': 1595,
 'yet': 1594,
 'yesterday': 1593,
 'yes': 1592,
 'yemen': 1591,
 'years': 1590,
 'year': 1589,
 'yeah': 1588,
 'yazidis': 1587,
 'yard': 1586,
 'ya': 1585,
 'wwii': 1584,
 'wtf': 1583,
 'wrong': 1582,
 'write': 1581,
 'wreckage': 1580,
 'wreck': 1579,
 'wrap': 1578,
 'wow': 1577,
 'wound': 1576,
 'would': 1575,
 'worth': 1574,
 'worst': 1573,
 'worse': 1572,
 'worry': 1571,
 'world': 1570,
 'worker': 1569,
 'work': 1568,
 'word': 1567,
 'wonder': 1566,
 'womens': 1565,
 'women': 1564,
 'woman': 1563,
 'wmata': 1562,
 'without': 1561,
 'wish': 1560,
 'wire': 1559,
 'wing': 1558,
 'windstorm': 1557,
 'wind': 1556,
 'win': 1555,
 'wildfires': 1554,
 'wildfire': 1553,
 'wild': 1552,
 'wife': 1551,
 'wht': 1550,
 'whole': 1549,
 'white

In [101]:
x_train

<7613x648 sparse matrix of type '<class 'numpy.float64'>'
	with 42046 stored elements in Compressed Sparse Row format>

## Шаг 6 Логистическая регрессия

In [99]:
from sklearn.linear_model import LogisticRegression

In [102]:
model = LogisticRegression(C=1.0, n_jobs=-1)
model.fit(x_train, train['target'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [103]:
model.predict(x_test)

array([1, 1, 1, ..., 1, 1, 0])

## Шаг 7 Кросс-Валидация

In [110]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(train.drop('target',axis=1), train['target'], test_size=0.33, random_state=42)
xtrain.shape, xtest.shape

((5100, 8), (2513, 8))

In [147]:
word_vectorizer = TfidfVectorizer(max_df=0.99, min_df=5, ngram_range=(1,3), max_features=10000, sublinear_tf=True)
word_vectorizer.fit(train['text_3'])

x_train = word_vectorizer.transform(xtrain['text_3'])
print(x_train.shape)

x_test = word_vectorizer.transform(xtest['text_3'])
print(x_test.shape)

(5100, 4506)
(2513, 4506)


In [148]:
model = LogisticRegression(C=1, n_jobs=-1)
model.fit(x_train, ytrain)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [149]:
from sklearn.metrics import accuracy_score
accuracy_score(model.predict(x_test), ytest)

0.8062077198567449

In [140]:
answer_test = word_vectorizer.transform(test['text_3'])
answer = pd.read_csv('sample_submission.csv')
answer['target'] = model.predict(answer_test)
answer.to_csv('answer.csv', index=False)