In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re



# Data processing

In [2]:
# Read the training split and discard the 'location' and 'keyword' columns.
df_train = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv'
).drop(columns=['location', 'keyword'])

# Same treatment for the test split.
df_test = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv'
).drop(columns=['location', 'keyword'])


In [3]:
# Compiled once: explicit emoji / pictograph blocks only. A single range
# running from U+24C2 to U+1F251 would also cover CJK, Hangul and most other
# scripts above U+24C2, deleting ordinary letters along with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only characters in the blocks listed in `_EMOJI_PATTERN` are removed;
    letters of any script (including CJK) are left untouched.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` with every run of matching characters replaced by ''.
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_url(text):
    """Delete every 'http://', 'https://' or 'www.' link from `text`.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def seperate_alphanumeric(text):
    """Split `text` into letter-only and digit-only runs joined by single spaces.

    Anything that is neither a letter nor a digit (punctuation, underscores,
    whitespace) acts as a separator and is dropped from the result.
    """
    token_pattern = re.compile(r"[^\W\d_]+|\d+")
    return " ".join(token_pattern.findall(text))

# Ordered (compiled pattern, replacement) rules used by `decontraction`.
# Irregular whole-word forms come first so the generic "n't" rule cannot
# mangle them (e.g. "won't" -> "wo not"). Compound forms such as "can't've"
# need no rule of their own: they resolve through two successive rules
# ("can't" -> "can not", then "'ve" -> " have").
_CONTRACTION_RULES = [
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        # Whole-word irregular contractions; \b stops matches inside longer
        # words (e.g. "let's" inside "outlet's").
        (r"\bwon't", "will not"),
        (r"\bcan't", "can not"),
        (r"\bshan't", "shall not"),
        (r"\blet's", "let us"),
        (r"\bain't", "am not"),
        (r"\by'all", "you all"),
        # Generic suffixes; the leading space separates them from the stem.
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),  # NOTE: also rewrites possessives ("John's" -> "John is")
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
]


def decontraction(text):
    """Expand common English contractions in `text`.

    Matching is case-insensitive and replacements are always lower-case.
    Typographic apostrophes (U+2019) are normalised to "'" first so that
    contractions typed with either character are expanded.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` with contractions such as "won't", "I'm", "they're"
        replaced by their expanded forms.
    """
    text = text.replace("\u2019", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

def remove_html(text):
    """Replace each '<...>' tag in `text` with a single space.

    The match is non-greedy, so every tag is replaced on its own.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub(' ', text)

def remove_mentions(text):
    """Delete every '@' followed by a run of non-whitespace characters.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` with each '@...' token removed (surrounding whitespace is kept).
    """
    # Raw string: '\S' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'@\S+', '', text)

def clean_text(text):
    """Run the full cleaning pipeline on one tweet and return the result.

    Step order matters: `seperate_alphanumeric` drops every character that is
    not a letter or digit, so it must run last. If it ran earlier, the
    apostrophes, angle brackets and '@' signs that `decontraction`,
    `remove_html` and `remove_mentions` look for would already be gone and
    those steps would do nothing. Lower-casing happens first because the
    contraction and URL patterns are written in lower case.
    """
    text = text.lower()
    for step in (
        remove_html,
        remove_url,
        remove_mentions,
        remove_emoji,
        decontraction,
        seperate_alphanumeric,
    ):
        text = step(text)
    return text


# Apply the identical pipeline to both splits.
df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [4]:
# Stack train texts on top of test texts so one TF-IDF vocabulary covers both.
text = pd.concat((df_train['text'], df_test['text']), axis=0)

# Unigrams and bigrams, English stop words removed, terms seen in fewer than
# two documents discarded.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=2)
vectors = tfidf_vectorizer.fit_transform(text)

# Labels for training and ids for the submission file.
train_target, test_ids = df_train['target'], df_test['id']

In [5]:
# Densify the TF-IDF matrix into a DataFrame (one row per tweet).
vectors = pd.DataFrame(vectors.toarray())

# The rows were stacked train-first, so split at the actual number of
# training rows instead of a hard-coded count that silently breaks if the
# input data ever changes.
n_train_rows = len(df_train)
train = vectors.iloc[:n_train_rows, :]
test = vectors.iloc[n_train_rows:, :]


# Stacking classifier

In [6]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Splitting the train set into two separate sets

In [7]:
# The first 5000 rows train the first-layer classifiers; the remaining rows
# are held back to build the blender's training set.
train1, train2 = train.iloc[:5000, :], train.iloc[5000:, :]
train_target1, train_target2 = train_target.iloc[:5000], train_target.iloc[5000:]

### Training the first layer of classifiers - all of them must be able to return probabilities

#### Multinomial Naive Bayes

In [8]:
# Candidate smoothing strengths and whether to learn class priors.
param_grid = dict(alpha=[0.1, 0.5, 1.0], fit_prior=[True, False])

# 5-fold cross-validated search on the first split, scored by F1.
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(train1, train_target1)

# Keep the best configuration found as the first-layer model.
mnb = grid_search.best_estimator_

#### Bernoulli Naive Bayes

In [9]:
# Candidate smoothing strengths, binarisation thresholds and prior settings.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0],
    binarize=[0.0, 0.5, 1.0],
    fit_prior=[True, False],
)

# 5-fold cross-validated search on the first split, scored by F1.
grid_search = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(train1, train_target1)

# Keep the best configuration found as the first-layer model.
bnb = grid_search.best_estimator_

#### Random Forest

In [None]:
# Fix the seed: a random forest draws bootstrap samples and feature subsets
# at random, so without it the selected hyper-parameters and the fitted model
# differ from run to run.
forest = RandomForestClassifier(random_state=42)

# Tree count, depth limit and minimum node sizes to explore.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated search on the first split, scored by F1.
grid_search = GridSearchCV(forest, param_grid, cv=5, scoring='f1')
grid_search.fit(train1, train_target1)

# Keep the best configuration found as the first-layer model.
forest = grid_search.best_estimator_

#### Logistic regression

In [None]:
# Fix the seed: both solvers in the grid ('liblinear' and 'saga') shuffle the
# data internally, so an unseeded run is not exactly reproducible.
log_clf = LogisticRegression(random_state=42)

# Regularisation type and strength, solver and iteration budget to explore.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

# 5-fold cross-validated search on the first split, scored by F1.
grid_search = GridSearchCV(log_clf, param_grid, cv=5, scoring='f1')
grid_search.fit(train1, train_target1)

# Keep the best configuration found as the first-layer model.
log_clf = grid_search.best_estimator_

#### KNeighbors Classifier

In [None]:
# Neighbour count, vote weighting, Minkowski power and search algorithm to explore.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# 5-fold cross-validated search on the first split, scored by F1.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(train1, train_target1)

# Keep the best configuration found as the first-layer model.
knc = grid_search.best_estimator_

### Creating a new training set for the second layer (blender) from the first-layer classifiers' predictions

In [None]:
# Per-model class-probability tables for the held-out split, each carrying
# the same row index as train2.
predictions_mnb = pd.DataFrame(mnb.predict_proba(train2), index=train2.index)
predictions_bnb = pd.DataFrame(bnb.predict_proba(train2), index=train2.index)
predictions_forest = pd.DataFrame(forest.predict_proba(train2), index=train2.index)
predictions_log = pd.DataFrame(log_clf.predict_proba(train2), index=train2.index)
predictions_knc = pd.DataFrame(knc.predict_proba(train2), index=train2.index)

# The column labelled 1 of each table becomes one blender feature.
train_blender = pd.DataFrame(
    {
        'mnb': predictions_mnb.loc[:, 1],
        'bnb': predictions_bnb.loc[:, 1],
        'forest': predictions_forest.loc[:, 1],
        'log': predictions_log.loc[:, 1],
        'knc': predictions_knc.loc[:, 1],
    },
    index=train2.index,
)

train_blender

### Training blender

In [None]:
svc = SVC()

# Regularisation strength, kernel family and kernel coefficient to explore.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto', 0.1, 1],
)

# 5-fold cross-validated search over the first-layer probabilities of the
# held-out split, scored by F1.
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1', cv=5)
grid_search.fit(train_blender, train_target2)

# The best configuration found is the second-layer model.
blender = grid_search.best_estimator_

### Final predictions for test set

In [None]:
# Feed the test rows through every first-layer model, take the second
# probability column of each, and let the blender turn those five features
# into the final labels.
final_predictions = blender.predict(
    pd.DataFrame(
        {
            'mnb': mnb.predict_proba(test)[:, 1],
            'bnb': bnb.predict_proba(test)[:, 1],
            'forest': forest.predict_proba(test)[:, 1],
            'log': log_clf.predict_proba(test)[:, 1],
            'knc': knc.predict_proba(test)[:, 1],
        },
        index=test_ids,
    )
)
final_predictions

In [None]:
# Wrap the predicted labels in a one-column frame keyed by test id, write it
# out as the submission CSV, and display it.
final_predictions = pd.DataFrame(data=dict(target=final_predictions), index=test_ids)
final_predictions.to_csv('stacking-classifier.csv')
final_predictions