In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import re
import string

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub('\s+', ' ', text).strip()
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [4]:
df_train['keyword'].replace(np.nan,'',inplace=True)
df_train['location'].replace(np.nan,'',inplace=True)

In [5]:
df_test['keyword'].replace(np.nan,'',inplace=True)
df_test['location'].replace(np.nan,'',inplace=True)

In [6]:
df_train['combined_text'] = df_train['text'] +' '+df_train['keyword']+' '+df_train['location']
df_test['combined_text'] = df_test['text'] + ' ' + df_test['keyword'] + ' ' + df_test['location']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))

# Fit and transform the train data, and transform the test data
X_train = vectorizer.fit_transform(df_train['combined_text'])
X_test = vectorizer.transform(df_test['combined_text'])

# Extract target variable
y_train = df_train['target']

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the training data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


parameters ={'C':[0.01,0.1,1],
             'penalty':['l2', 'l1', 'elasticnet'],
             'solver':['lbfgs','liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
cv_model = GridSearchCV(model, parameters)
cv_model.fit(X_train_split,y_train_split)

# Predict on the validation set
y_val_pred = cv_model.predict(X_val_split)

# Evaluate the model
print('Validation Accuracy:', accuracy_score(y_val_split, y_val_pred))
print(classification_report(y_val_split, y_val_pred))

Validation Accuracy: 0.8017071569271176
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.81      0.70      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



150 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Petr\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Petr\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Petr\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users

In [12]:
predictions = cv_model.predict(X_test)

In [13]:
submission = pd.DataFrame({'id': df_test['id'],'target':predictions})

In [14]:
submission.to_csv('submission_ML.csv', index=False)