<a href="https://colab.research.google.com/github/takakishi/HEC_DS_ML_project/blob/main/src/t_logit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Import Data

In [7]:
# !pip install spacy
# !python -m spacy download fr_core_news_sm
!pip install textstat
# BERT
# !pip install accelerate -U

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [8]:
# Libraries
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Training and further analysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import spacy
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stopwords
# BERT
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# import torch
# from transformers import Trainer, TrainingArguments
# GridSearchCV for hyperparameter tuning of the logistic regression
from sklearn.model_selection import GridSearchCV
# Check if SVM model performs better
from sklearn.svm import SVC
# Further
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
import textstat
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import TruncatedSVD

In [4]:
# Import Data
sample_submission = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/sample_submission.csv')
training_data = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/training_data.csv')
unlabelled_test_data = pd.read_csv('https://raw.githubusercontent.com/takakishi/HEC_DS_ML_project/main/data/data_raw/unlabelled_test_data.csv')

In [None]:
sample_submission.head

<bound method NDFrame.head of         id difficulty
0        0         A1
1        1         A1
2        2         A1
3        3         A1
4        4         A1
...    ...        ...
1195  1195         A1
1196  1196         A1
1197  1197         A1
1198  1198         A1
1199  1199         A1

[1200 rows x 2 columns]>

In [None]:
training_data.head

<bound method NDFrame.head of         id                                           sentence difficulty
0        0  Les coûts kilométriques réels peuvent diverger...         C1
1        1  Le bleu, c'est ma couleur préférée mais je n'a...         A1
2        2  Le test de niveau en français est sur le site ...         A1
3        3           Est-ce que ton mari est aussi de Boston?         A1
4        4  Dans les écoles de commerce, dans les couloirs...         B1
...    ...                                                ...        ...
4795  4795  C'est pourquoi, il décida de remplacer les hab...         B2
4796  4796  Il avait une de ces pâleurs splendides qui don...         C1
4797  4797  Et le premier samedi de chaque mois, venez ren...         A2
4798  4798  Les coûts liés à la journalisation n'étant pas...         C2
4799  4799  Sur le sable, la mer haletait de toute la resp...         C2

[4800 rows x 3 columns]>

In [None]:
unlabelled_test_data.head

<bound method NDFrame.head of         id                                           sentence
0        0  Nous dûmes nous excuser des propos que nous eû...
1        1  Vous ne pouvez pas savoir le plaisir que j'ai ...
2        2  Et, paradoxalement, boire froid n'est pas la b...
3        3  Ce n'est pas étonnant, car c'est une saison my...
4        4  Le corps de Golo lui-même, d'une essence aussi...
...    ...                                                ...
1195  1195  C'est un phénomène qui trouve une accélération...
1196  1196  Je vais parler au serveur et voir si on peut d...
1197  1197  Il n'était pas comme tant de gens qui par pare...
1198  1198      Ils deviennent dangereux pour notre économie.
1199  1199  Son succès a généré beaucoup de réactions néga...

[1200 rows x 2 columns]>

# Functions and Data Splitting

In [5]:
# 1. Basic Preprocessing (Logit: 0.44, RF: 0.383)

def preprocess_basic(text):
    text = text.lower()  # Convert to lower case
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Apply Basic Preprocessing
training_data['processed_sentence_basic'] = training_data['sentence'].apply(preprocess_basic)

# Initialize and Fit TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_basic = tfidf_vectorizer.fit_transform(training_data['processed_sentence_basic'])
y_basic = training_data['difficulty']

# Split Basic Preprocessed Data
X_train_basic, X_val_basic, y_train_basic, y_val_basic = train_test_split(X_basic, y_basic, test_size=0.2, random_state=42)

In [None]:
# 2. Tokenization and Stopword Removal (0.405, RF: 0.362)
def preprocess_with_stopwords(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

training_data['processed_sentence_stopwords'] = training_data['sentence'].apply(preprocess_with_stopwords)

X_stopwords = tfidf_vectorizer.fit_transform(training_data['processed_sentence_stopwords'])
y_stopwords = training_data['difficulty']

X_train_stopwords, X_val_stopwords, y_train_stopwords, y_val_stopwords = train_test_split(X_stopwords, y_stopwords, test_size=0.2, random_state=42)

In [None]:
# 3. Lemmatization and POS Tagging
def preprocess_with_lemmatization_pos(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    pos_tags = [token.pos_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens), pos_tags

# Apply lemmatization and POS tagging preprocessing
training_data['processed_sentence_lemmatization_pos'] = training_data['sentence'].apply(lambda x: preprocess_with_lemmatization_pos(x)[0])

# Fit and transform with TF-IDF
X_lemmatization_pos = tfidf_vectorizer.fit_transform(training_data['processed_sentence_lemmatization_pos'])
y_lemmatization_pos = training_data['difficulty']

# Split the data
X_train_lemmatization_pos, X_val_lemmatization_pos, y_train_lemmatization_pos, y_val_lemmatization_pos = train_test_split(X_lemmatization_pos, y_lemmatization_pos, test_size=0.2, random_state=42)

In [None]:
# 4. Advanced Preprocessing with NER and Dependency Parsing
def preprocess_advanced(text):
    doc = nlp(text.lower())
    tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.ent_type_:
            continue
        tokens.append(token.lemma_)
    ner = [ent.label_ for ent in doc.ents]
    dep_parse = [(token.text, token.dep_) for token in doc]
    return ' '.join(tokens), ner, dep_parse

# Apply advanced preprocessing
training_data['processed_sentence_advanced'] = training_data['sentence'].apply(lambda x: preprocess_advanced(x)[0])

# Fit and transform with TF-IDF
X_advanced = tfidf_vectorizer.fit_transform(training_data['processed_sentence_advanced'])
y_advanced = training_data['difficulty']

# Split the data
X_train_advanced, X_val_advanced, y_train_advanced, y_val_advanced = train_test_split(X_advanced, y_advanced, test_size=0.2, random_state=42)

In [None]:
# 5.
nlp = spacy.load('fr_core_news_sm')

def preprocess_with_entity_exclusion(text):
    # Process the text with spaCy
    doc = nlp(text.lower())

    # Tokenize and apply advanced preprocessing
    tokens = []
    for token in doc:
        # Skip stopwords and punctuation
        if token.is_stop or token.is_punct:
            continue

        # Append lemmatized form of the token if it's not a named entity
        if token.ent_type_ == '':
            tokens.append(token.lemma_)

    # Reconstruct text
    return ' '.join(tokens)

# Apply preprocessing with entity exclusion
training_data['processed_sentence_entity_exclusion'] = training_data['sentence'].apply(preprocess_with_entity_exclusion)

# Fit and transform with TF-IDF
X_entity_exclusion = tfidf_vectorizer.fit_transform(training_data['processed_sentence_entity_exclusion'])
y_entity_exclusion = training_data['difficulty']

# Split the data
X_train_entity_exclusion, X_val_entity_exclusion, y_train_entity_exclusion, y_val_entity_exclusion = train_test_split(X_entity_exclusion, y_entity_exclusion, test_size=0.2, random_state=42)


# Split the data

In [None]:
X_train.shape, X_val.shape

((3840, 14275), (960, 14275))

# Logit (with Hyperparameter Tuning), etc

In [9]:
# 1. Basic Preprocessing

# 1-1. Simplest ----
log_reg_basic = LogisticRegression(random_state=42)
log_reg_basic.fit(X_train_basic, y_train_basic)
y_val_pred_basic = log_reg_basic.predict(X_val_basic)
accuracy_basic = accuracy_score(y_val_basic, y_val_pred_basic)
print("Accuracy with Basic Preprocessing:", accuracy_basic)

# Hyperparameter Tuning for Logistic Regression (Best Score: 0.453125) ----
# Hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
# Logistic regression model
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_basic, y_train_basic)
# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


# 1-2. Support Vector Machine ----
svm_model = SVC(kernel='linear')
# Train the model
svm_model.fit(X_train_basic, y_train_basic)
# Evaluate the model
y_val_pred_svm = svm_model.predict(X_val_basic)
accuracy_svm = accuracy_score(y_val_basic, y_val_pred_svm)
print("Accuracy with SVM:", accuracy_svm)


# 1-3. Feature Engineering 1 - Adding Sentence Length (0.475) ----
training_data['sentence_length'] = training_data['sentence'].apply(lambda x: len(x.split()))
scaler = StandardScaler()
length_scaled = scaler.fit_transform(training_data[['sentence_length']])

# Add sentence length feature to TF-IDF features
X_with_length_basic = hstack((X_basic, length_scaled))

# Split the data with the new feature
X_train_length_basic, X_val_length_basic, y_train_length_basic, y_val_length_basic = train_test_split(X_with_length_basic, y_basic, test_size=0.2, random_state=42)

# Train logistic regression with the new feature
log_reg_length_basic = LogisticRegression(random_state=42, max_iter=1000)
log_reg_length_basic.fit(X_train_length_basic, y_train_length_basic)

# Evaluate the model
y_val_pred_length_basic = log_reg_length_basic.predict(X_val_length_basic)
accuracy_length_basic = accuracy_score(y_val_length_basic, y_val_pred_length_basic)
print("Accuracy with Sentence Length Feature for Basic Preprocessing:", accuracy_length_basic)


# 1-4. Feature Engineering 2 - Calculate readability score (0.4729) ----
training_data['readability_score'] = training_data['processed_sentence_basic'].apply(textstat.flesch_reading_ease)
readability_scaled = scaler.fit_transform(training_data[['readability_score']])

# Combine readability score with TF-IDF and sentence length features
X_with_readability = hstack((X_with_length_basic, readability_scaled))

# Split the data
X_train_readability, X_val_readability, y_train_readability, y_val_readability = train_test_split(X_with_readability, y_basic, test_size=0.2, random_state=42)

# Train and evaluate logistic regression
log_reg_readability = LogisticRegression(random_state=42, max_iter=1000)
log_reg_readability.fit(X_train_readability, y_train_readability)
y_val_pred_readability = log_reg_readability.predict(X_val_readability)
accuracy_readability = accuracy_score(y_val_readability, y_val_pred_readability)
print("Accuracy with Readability Feature:", accuracy_readability)


# 1-5 Feature Engineering 2 (0.401) ----
# Create an imputer object with a strategy of replacing NaN values with the mean of the column
# Standardize the additional features
scaler = StandardScaler()
length_scaled = scaler.fit_transform(training_data[['sentence_length']])
readability_scaled = scaler.fit_transform(training_data[['readability_score']])

# Combine TF-IDF features with the additional features
X_combined = hstack([X_basic, length_scaled, readability_scaled])

# Split into training and validation sets
X_train_more_features, X_val_more_features, y_train_more_features, y_val_more_features = train_test_split(X_combined, y_basic, test_size=0.2, random_state=42)

# Create and apply the imputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_more_features)
X_val_imputed = imputer.transform(X_val_more_features)

# Train logistic regression with the imputed feature set
log_reg_imputed = LogisticRegression(random_state=42, max_iter=1000)
log_reg_imputed.fit(X_train_imputed, y_train_more_features)

# Evaluate the model
y_val_pred_imputed = log_reg_imputed.predict(X_val_imputed)
accuracy_imputed = accuracy_score(y_val_more_features, y_val_pred_imputed)
print("Accuracy with Imputed Feature Set:", accuracy_imputed)


# 1-6. Feature Engineering ----
# Reduce the dimensionality of the features
svd = TruncatedSVD(n_components=100)  # Adjust n_components based on your dataset
X_train_reduced = svd.fit_transform(X_train_more_features)
X_val_reduced = svd.transform(X_val_more_features)

# Now use the reduced feature set with the HistGradientBoostingClassifier
hist_gb_clf = HistGradientBoostingClassifier(random_state=42)
hist_gb_clf.fit(X_train_reduced, y_train_more_features)

# Evaluate the model
y_val_pred_hist_gb = hist_gb_clf.predict(X_val_reduced)
accuracy_hist_gb = accuracy_score(y_val_more_features, y_val_pred_hist_gb)
print("Accuracy with HistGradientBoostingClassifier and Reduced Features:", accuracy_hist_gb)

Accuracy with Basic Preprocessing: 0.440625




Best Parameters: {'C': 10, 'penalty': 'l2'}
Best Score: 0.453125
Accuracy with SVM: 0.44895833333333335
Accuracy with Sentence Length Feature for Basic Preprocessing: 0.475
Accuracy with Readability Feature: 0.47291666666666665
Accuracy with Imputed Feature Set: 0.47291666666666665
Accuracy with HistGradientBoostingClassifier and Reduced Features: 0.415625


In [None]:
# 2. Tokenization and Stopword Removal
# Logistic Regression for Tokenization and Stopword Removal
log_reg_stopwords = LogisticRegression(random_state=42)
log_reg_stopwords.fit(X_train_stopwords, y_train_stopwords)
y_val_pred_stopwords = log_reg_stopwords.predict(X_val_stopwords)
accuracy_stopwords = accuracy_score(y_val_stopwords, y_val_pred_stopwords)
print("Accuracy with Tokenization and Stopword Removal:", accuracy_stopwords)

# Hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
# Logistic regression model
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_stopwords, y_train_stopwords)
# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Accuracy with Tokenization and Stopword Removal: 0.40520833333333334




Best Parameters: {'C': 10, 'penalty': 'l2'}
Best Score: 0.39713541666666663


In [None]:
# 3. Lemmatization and POS Tagging
# Logistic Regression for Lemmatization and POS Tagging
log_reg_lemmatization_pos = LogisticRegression(random_state=42)
log_reg_lemmatization_pos.fit(X_train_lemmatization_pos, y_train_lemmatization_pos)
y_val_pred_lemmatization_pos = log_reg_lemmatization_pos.predict(X_val_lemmatization_pos)
accuracy_lemmatization_pos = accuracy_score(y_val_lemmatization_pos, y_val_pred_lemmatization_pos)
print("Accuracy with Lemmatization and POS Tagging:", accuracy_lemmatization_pos)

# Hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
# Logistic regression model
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_lemmatization_pos, y_train_lemmatization_pos)
# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Accuracy with Lemmatization and POS Tagging: 0.3875




Best Parameters: {'C': 10, 'penalty': 'l2'}
Best Score: 0.39322916666666663


In [None]:
# 4. Advanced Preprocessing with NER and Dependency Parsing
# Logistic Regression for Advanced Preprocessing with NER and Dependency Parsing
log_reg_advanced = LogisticRegression(random_state=42)
log_reg_advanced.fit(X_train_advanced, y_train_advanced)
y_val_pred_advanced = log_reg_advanced.predict(X_val_advanced)
accuracy_advanced = accuracy_score(y_val_advanced, y_val_pred_advanced)
print("Accuracy with Advanced Preprocessing (NER & Dependency Parsing):", accuracy_advanced)

# Hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
# Logistic regression model
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_advanced, y_train_advanced)
# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Accuracy with Advanced Preprocessing (NER & Dependency Parsing): 0.38958333333333334




Best Parameters: {'C': 10, 'penalty': 'l2'}
Best Score: 0.39010416666666664


In [None]:
# 5. Preprocessing with Entity Exclusion
# Logistic Regression for Preprocessing with Entity Exclusion
log_reg_entity_exclusion = LogisticRegression(random_state=42)
log_reg_entity_exclusion.fit(X_train_entity_exclusion, y_train_entity_exclusion)
y_val_pred_entity_exclusion = log_reg_entity_exclusion.predict(X_val_entity_exclusion)
accuracy_entity_exclusion = accuracy_score(y_val_entity_exclusion, y_val_pred_entity_exclusion)
print("Accuracy with Preprocessing with Entity Exclusion:", accuracy_entity_exclusion)

# Hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
# Logistic regression model
log_reg = LogisticRegression(random_state=42, solver='liblinear')
# Grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_entity_exclusion, y_train_entity_exclusion)
# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Accuracy with Preprocessing with Entity Exclusion: 0.38958333333333334




Best Parameters: {'C': 10, 'penalty': 'l2'}
Best Score: 0.39010416666666664


# Random Forest

In [None]:
# Random Forest
rf_clf = RandomForestClassifier(random_state=42)

rf_clf.fit(X_train, y_train)

y_val_pred = rf_clf.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)

precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_val_pred, average='weighted')

conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

# BERT

In [None]:
# Load the BERT tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(training_data['difficulty'].unique()))