In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import spacy 
import string

In [None]:
# Import the data
df_train = pd.read_csv('https://raw.githubusercontent.com/zsgithub2/Hublot-project/main/Data/training_data.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/zsgithub2/Hublot-project/main/Data/test_data.csv')


# Create a new text feature that also includes the keyword.
# Missing values are replaced by '' before concatenating: NaN + str is NaN, so a
# row with a missing keyword would otherwise lose its text entirely.
# .str.strip() removes the stray leading space left when the keyword is empty.
df_train['key_text'] = (
    (df_train['keyword'].fillna('') + ' ' + df_train['text'].fillna(''))
    .str.strip()
    .astype(str)
)
df_test['key_text'] = (
    (df_test['keyword'].fillna('') + ' ' + df_test['text'].fillna(''))
    .str.strip()
    .astype(str)
)

# Define tokenizer cleaner: spaCy English model, its stop-word set, and ASCII punctuation
sp = spacy.load('en_core_web_sm') 
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS 
punctuations = string.punctuation


# Tokenizer: I wrote it myself instead of using the pipeline we saw in class,
# because I want the DataFrame it returns as output, in case I ever want to modify it
def tokenizer(message):
    """Clean texts with spaCy and return their bag-of-words counts as a DataFrame.

    Despite its name, this function goes beyond tokenizing: each text is run
    through the module-level spaCy pipeline ``sp``, filtered, lemmatized and
    lower-cased, and the cleaned texts are then vectorized with a freshly
    fitted ``CountVectorizer`` (unigrams, scikit-learn English stop words).

    Parameters
    ----------
    message : iterable of str
        Texts to process (for example a pandas Series of strings).

    Returns
    -------
    pandas.DataFrame
        One row per input text, in input order and with a default integer
        index; one column per vocabulary term; values are term counts.
        The vocabulary is learned from ``message`` itself, so two separate
        calls generally produce different columns.
    """
    cleaned_texts = []
    # sp.pipe processes the texts as a stream, which is faster than calling
    # sp() once per text and yields the same Doc objects.
    for doc in sp.pipe(message):
        lemmas = [
            token.lemma_.lower()
            for token in doc
            # Substring test against string.punctuation: drops single ASCII
            # punctuation characters (and any run that appears contiguously
            # in that constant).
            if str(token) not in punctuations
            and not token.is_stop
            and not token.is_space
        ]
        cleaned_texts.append(" ".join(lemmas))

    count = CountVectorizer(ngram_range=(1, 1), stop_words="english")
    bow = count.fit_transform(cleaned_texts)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on versions that predate get_feature_names_out().
    if hasattr(count, "get_feature_names_out"):
        feature_names = count.get_feature_names_out()
    else:
        feature_names = count.get_feature_names()

    # toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
    return pd.DataFrame(bow.toarray(), columns=feature_names)

In [None]:
# 80/20 hold-out check (run this to get an idea of how good the program is)
X_train, X_test, y_train, y_test = train_test_split(
    tokenizer(df_train['key_text']),
    df_train['target'],
    test_size=0.2,
    random_state=72,
)

# Cross-validated logistic regression (5 folds), fitted on the 80% split
classifier = LogisticRegressionCV(
    cv=5,
    solver="lbfgs",
    max_iter=1000,
    random_state=72,
).fit(X_train, y_train)

# Accuracy on the held-out 20% is the baseline to improve on
pred = classifier.predict(X_test)
print('You have to beat: ' + str(accuracy_score(y_true=y_test, y_pred=pred)))

## previous results:

For the first submission, we had an accuracy of about 0.794. Our final score was 0.81.
For the second submission, we had an accuracy of 0.80077. Our final score was 0.823.

 

In [None]:
# Build a single bag-of-words table over train and test texts together,
# so that both parts share the same columns
tokens = tokenizer(pd.concat([df_train['key_text'], df_test['key_text']]))

# Split the table back by position: training rows come first, test rows after
X_train = tokens.iloc[:len(df_train)]
X_test = tokens.iloc[len(df_train):]
y_train = df_train['target']

# Fit on all the labelled rows, then predict the test rows
classifier = LogisticRegressionCV(
    cv=5,
    solver="lbfgs",
    max_iter=1000,
    random_state=72,
).fit(X_train, y_train)
pred = classifier.predict(X_test)

# Save the prediction as CSV (P.S. run this locally, I don't know if it works on colab)
df = pd.DataFrame({'target': pred})
df.to_csv('submission2.csv', index=False)