In [1]:
import pickle
import warnings
from string import punctuation

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Read the three competition files in one pass: the training set, the test
# set and the sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
        "/kaggle/input/nlp-getting-started/sample_submission.csv",
    )
)
train.shape, test.shape

((7613, 5), (3263, 4))

In [3]:
# Give the test frame an all-NaN `target` column so it has the same columns as
# `train`. A scalar is broadcast to every row, so no per-row list is needed.
test['target'] = np.nan
test.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,
1,2,,,"Heard about #earthquake is different cities, s...",
2,3,,,"there is a forest fire at spot pond, geese are...",
3,9,,,Apocalypse lighting. #Spokane #wildfires,
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,


In [4]:
# Stack the training rows on top of the test rows, then key every row by its
# "id" column.
stacked = pd.concat([train, test])
df = stacked.set_index("id")
df.shape

(10876, 4)

In [5]:
def FillNA(df, cols, fill_value="9999999999999999"):
    """Replace missing values in the selected columns with a sentinel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place and also returned.
    cols : list of str
        Labels of the columns whose missing values are filled.
    fill_value : str, optional
        Value written into every missing cell of ``cols``. Defaults to the
        sixteen-digit placeholder string this function has always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; columns outside ``cols``
        are left untouched.
    """
    df[cols] = df[cols].fillna(fill_value)
    return df

# Fill the gaps in the two sparse metadata columns before they are used in
# string concatenation.
df = FillNA(df, cols=['keyword', 'location'])
df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,9999999999999999,9999999999999999,Our Deeds are the Reason of this #earthquake M...,1.0
4,9999999999999999,9999999999999999,Forest fire near La Ronge Sask. Canada,1.0
5,9999999999999999,9999999999999999,All residents asked to 'shelter in place' are ...,1.0
6,9999999999999999,9999999999999999,"13,000 people receive #wildfires evacuation or...",1.0
7,9999999999999999,9999999999999999,Just got sent this photo from Ruby #Alaska as ...,1.0


In [6]:
# Build one space-separated document per row: keyword, then location, then text.
space = " "
df['all_text'] = df['keyword'] + space + df['location'] + space + df['text']
df.head()

Unnamed: 0_level_0,keyword,location,text,target,all_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,9999999999999999,9999999999999999,Our Deeds are the Reason of this #earthquake M...,1.0,9999999999999999 9999999999999999 Our Deeds ar...
4,9999999999999999,9999999999999999,Forest fire near La Ronge Sask. Canada,1.0,9999999999999999 9999999999999999 Forest fire ...
5,9999999999999999,9999999999999999,All residents asked to 'shelter in place' are ...,1.0,9999999999999999 9999999999999999 All resident...
6,9999999999999999,9999999999999999,"13,000 people receive #wildfires evacuation or...",1.0,"9999999999999999 9999999999999999 13,000 peopl..."
7,9999999999999999,9999999999999999,Just got sent this photo from Ruby #Alaska as ...,1.0,9999999999999999 9999999999999999 Just got sen...


In [7]:
# The three source columns are now contained in `all_text`, so discard them.
source_columns = ['keyword','location','text']
df = df.drop(columns=source_columns)
df.head()

Unnamed: 0_level_0,target,all_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,9999999999999999 9999999999999999 Our Deeds ar...
4,1.0,9999999999999999 9999999999999999 Forest fire ...
5,1.0,9999999999999999 9999999999999999 All resident...
6,1.0,"9999999999999999 9999999999999999 13,000 peopl..."
7,1.0,9999999999999999 9999999999999999 Just got sen...


In [8]:
# "%20" is the URL-encoded form of a space; turn it back into a real space in
# every document.
decoded_documents = [document.replace("%20", " ") for document in df['all_text']]
df['all_text'] = decoded_documents
df.tail()

Unnamed: 0_level_0,target,all_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10861,,9999999999999999 9999999999999999 EARTHQUAKE S...
10865,,9999999999999999 9999999999999999 Storm in RI ...
10868,,9999999999999999 9999999999999999 Green Line d...
10874,,9999999999999999 9999999999999999 MEG issues H...
10875,,9999999999999999 9999999999999999 #CityofCalga...


In [9]:
# Loop-invariant helpers, built once rather than once per token.
p_dict = {i: " " for i in punctuation}
punct_to_space = str.maketrans(p_dict)               # every punctuation char -> " "
english_stopwords = set(stopwords.words('english'))  # set gives O(1) membership tests
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

words = []
for text in df.all_text:
    # Lower-case the document, then blank out all punctuation in a single pass.
    cleaned = text.lower().translate(punct_to_space)
    # Tokenize, keep only purely alphabetic, multi-character, non-stopword
    # tokens, and reduce each survivor to its lemmatized-then-stemmed form.
    tokens = [stemmer.stem(lemmatizer.lemmatize(token))
              for token in word_tokenize(cleaned)
              if token not in english_stopwords
              and token.isalpha()
              and len(token) > 1]
    words.append(" ".join(tokens).strip())

# Report how many distinct cleaned documents exist, and show them.
unique_documents = np.unique(words)
print(len(unique_documents), unique_documents)

10412 [''
 'ablaz abuja noch el bestia alexi sanchez happi see teammat train hard goodnight gunner http co'
 'ablaz africa africanbaz break news nigeria flag set ablaz aba http co'
 ... 'wreckag wreckag conclus confirm malaysia pm investig famili http co'
 'wreckag wreckag najib http co najibrazak malaysiaairlin'
 'wreckag xi china wreckag conclus confirm miss flight via yahoonewsdigest']


In [10]:
# TF-IDF over unigrams and bigrams of the cleaned documents. Terms present in
# more than 90% of the documents, or in fewer than 3 documents, are discarded.
# (TfidfVectorizer is imported in the notebook's top import cell.)
idf = TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1,2))

# NOTE: `.toarray()` turns the sparse result into a dense
# (n_documents, n_terms) array, and `words` is rebound from the list of
# cleaned strings to that array -- re-run the cleaning cell before re-running
# this one.
words = idf.fit_transform(words).toarray()
words.shape

(10876, 9923)

In [11]:
# Newer scikit-learn releases expose the vocabulary through
# `get_feature_names_out()` and no longer provide `get_feature_names()`;
# use whichever method this installation has.
if hasattr(idf, "get_feature_names_out"):
    term_names = idf.get_feature_names_out()
else:
    term_names = idf.get_feature_names()

# One row per document (same index as `df`), one column per TF-IDF term.
df_word = pd.DataFrame(words, columns=term_names, index=df.index)
df_word.head()

Unnamed: 0_level_0,aa,ab,ab canada,aba,aba woman,abandon,abandon aircraft,abandon lrt,abbott,abbswinston,...,zone come,zone http,zone war,zouma,zouma flatten,åá,åè,åê,ìàekdar,ìït
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Rank terms by their summed TF-IDF weight over all documents; show the top 20.
term_weight = df_word.sum(axis=0)
term_weight.sort_values(ascending=False).head(20)

co         420.004298
http       417.676164
http co    417.253468
fire       128.776501
bomb       101.965755
like        94.725410
burn        90.162470
co http     89.406927
new         85.759653
scream      84.141474
get         79.312551
amp         75.185849
emerg       74.145255
drown       70.789041
fatal       70.690100
flood       70.654305
obliter     69.094940
disast      68.790081
build       66.954105
evacu       66.421410
dtype: float64

In [13]:
# Remove the two heaviest terms ('co' and 'http') and the term 'target'.
# The 'target' term is presumably dropped because its column name would clash
# with the label column once the frames are concatenated -- confirm.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df_word = df_word.drop(columns=['co', 'http', 'target'], errors='ignore')
df_word.sum(axis=0).sort_values(ascending=False).head(1000)

http co         417.253468
fire            128.776501
bomb            101.965755
like             94.725410
burn             90.162470
                   ...    
gbbo              6.766001
charact           6.756549
deton sensor      6.756436
ignit knock       6.756436
knock deton       6.756436
Length: 1000, dtype: float64

In [14]:
# Put the label column next to the TF-IDF features, then discard the raw
# document text, which is no longer needed.
combined = pd.concat([df, df_word], axis=1)
df_final = combined.drop(columns=['all_text'])
df_final.head()

Unnamed: 0_level_0,target,aa,ab,ab canada,aba,aba woman,abandon,abandon aircraft,abandon lrt,abbott,...,zone come,zone http,zone war,zouma,zouma flatten,åá,åè,åê,ìàekdar,ìït
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Rows with a missing label form the prediction set; keep only their features.
is_unlabelled = df_final['target'].isna()
test = df_final.loc[is_unlabelled].drop(columns=['target'])
test.head()

Unnamed: 0_level_0,aa,ab,ab canada,aba,aba woman,abandon,abandon aircraft,abandon lrt,abbott,abbswinston,...,zone come,zone http,zone war,zouma,zouma flatten,åá,åè,åê,ìàekdar,ìït
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Rows that do have a label form the training set: features in X, labels in y.
has_label = ~df_final['target'].isna()
X = df_final.loc[has_label].drop(columns=['target'])
y = df_final.loc[has_label, 'target']
X.shape, y.shape

((7613, 9920), (7613,))

In [17]:
# Candidate estimators, each paired with the hyper-parameter grid to search.

# NOTE(review): `random_state` only influences some LogisticRegression
# solvers -- confirm that searching over it is intended.
lr = LogisticRegression()
param_lr = {'fit_intercept': [True, False],
            'random_state': [1, 2, 3]}

rf = RandomForestClassifier(random_state=123)
param_rf = {'criterion': ['gini', 'entropy'],
            'max_depth': np.arange(2, 20).tolist()}

dt = DecisionTreeClassifier(random_state=234)
param_dt = {'criterion': ['gini', 'entropy'],
            'max_depth': np.arange(1, 10).tolist()}

# The boosting estimators are built from the directly imported classes, so
# re-running this cell still works after `lgbm` / `xgb` have been rebound from
# the library modules to estimator instances. The variable names are kept
# because other cells may pass them on.
lgbm = LGBMClassifier(objective='binary')
param_lgbm = {'learning_rate': [0.01, 0.03, 0.05, 0.1],
              # 'n_estimators' (plural) is the tree-count parameter; the
              # singular spelling is not one the estimator acts on.
              'n_estimators': np.arange(5, 10).tolist(),
              'num_leaves': np.arange(3, 15).tolist(),
              'reg_alpha': [0.01, 0.02, 0.03]}

xgb = XGBClassifier()
param_xgb = {'learning_rate': [0.001, 0.01, 0.03, 0.1],
             'n_estimators': np.arange(5, 10).tolist(),
             'max_depth': np.arange(3, 15).tolist(),
             'alpha': [1, 2, 3]}

In [18]:
def GSCV(X, y, test, est, model, params, sample, cv=5):
    """Grid-search an estimator, then save its test predictions and best model.

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    test : pandas.DataFrame
        Rows to predict; its index becomes the index of the submission file.
    est
        Estimator to tune.
    model : str
        Base file name. Output goes to ``/kaggle/working/<model>.csv`` and
        ``/kaggle/working/<model>.pickle``.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    sample
        Accepted for call compatibility; not used by this function.
    cv : int or cross-validation splitter, optional
        Fold specification forwarded to ``GridSearchCV`` (default 5). A string
        is not a valid specification; it triggers a warning and the search
        falls back to ``GridSearchCV``'s own default.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` of the fitted search.
    """
    if isinstance(cv, str):
        # Existing callers pass a metric name in this slot. Keep them working
        # but say so, instead of silently discarding the argument.
        warnings.warn(
            f"cv={cv!r} is not a fold specification; "
            "using GridSearchCV's default cross-validation instead.",
            stacklevel=2,
        )
        cv = None

    # NOTE(review): micro-averaged F1 coincides with accuracy for single-label
    # classification -- confirm that this is the intended selection metric.
    grid = GridSearchCV(estimator=est,
                        param_grid=params,
                        scoring='f1_micro',
                        cv=cv)

    name = "/kaggle/working/" + model
    grid.fit(X, y)

    # Predict with the refitted best estimator and store integer labels,
    # indexed like `test`.
    predictions = grid.predict(test)
    submission = pd.DataFrame(predictions, columns=['target'], index=test.index)
    submission['target'] = submission['target'].astype('int64')
    submission.to_csv(name + ".csv")

    # Persist the best estimator next to the submission file.
    with open(name + ".pickle", 'wb') as handle:
        pickle.dump(grid.best_estimator_, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return grid.best_score_, grid.best_params_

In [19]:
# GSCV's eighth positional parameter is `cv` (a fold specification), not a
# metric name, so only the seven required arguments are passed and `cv` keeps
# its default. The second return value is the best parameter dict.
score, model = GSCV(X, y, test, lr, 'logr', param_lr, sample)
print(f"Score: {score}, Model: {model}")

Score: 0.645352600467816, Model: {'fit_intercept': True, 'random_state': 1}
