<a href="https://colab.research.google.com/github/wrmdx/SVM_SentimentAnalysis/blob/main/TP_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
import nltk

# Fetch the NLTK resources this notebook relies on: the stop-word lists
# (read later through stopwords.words('english')) and the WordNet corpus
# (presumably what WordNetLemmatizer needs at lemmatization time).
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Load the raw reviews and their labels as two plain strings.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes both files are UTF-8 (or plain ASCII) - confirm.
with open('/content/reviews.txt', 'r', encoding='utf-8') as f:
    reviews = f.read()
with open('/content/labels.txt', 'r', encoding='utf-8') as f:
    labels = f.read()

In [19]:
# Sanity check: print the leading characters of each raw string
# (2000 for the reviews, 100 for the labels).
for raw_text, preview_len in ((reviews, 2000), (labels, 100)):
    print(raw_text[:preview_len])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [20]:
# Lower-case the whole corpus, then delete every character that appears in
# string.punctuation (all other characters, including newlines, are kept).
punctuation_remover = str.maketrans('', '', punctuation)
reviews = reviews.lower().translate(punctuation_remover)

In [21]:
# Cut the corpus at every newline character to get a list of individual reviews.
review_split = reviews.split(sep='\n')

In [22]:
# Resources for text preprocessing: a lemmatizer and the set of English
# stop words (a set, so membership tests in preprocess_text are cheap).
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [23]:
def preprocess_text(text):
    """Return `text` with stop words removed and the remaining words lemmatized.

    The text is split on whitespace, every token found in `stop_words` is
    dropped, each remaining token goes through `lemmatizer.lemmatize`, and the
    results are joined back together with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(map(lemmatizer.lemmatize, kept_tokens))


# Apply the preprocessing to every review, preserving their order.
preprocessed_reviews = list(map(preprocess_text, review_split))

In [24]:
# Encode the labels: one label per line, 1 when the line is exactly
# 'positive' and 0 for any other content.
label_split = labels.split(sep='\n')
encoded_labels = np.array([int(label == 'positive') for label in label_split])

In [25]:
# Truncate reviews and labels to the shorter of the two so that both
# sequences contain the same number of entries.
min_len = min(map(len, (preprocessed_reviews, encoded_labels)))
preprocessed_reviews, encoded_labels = (
    preprocessed_reviews[:min_len],
    encoded_labels[:min_len],
)

In [27]:
# Turn the preprocessed reviews into TF-IDF features (at most 5000 terms)
# and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on all reviews, i.e. before the
# train/test split performed further down, so held-out reviews influence the
# learned vocabulary and weights. Consider fitting on the training part only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_reviews)
X = tfidf_matrix.toarray()


In [28]:
# Standardize the features: remove the mean and scale to unit variance.
# NOTE(review): the scaler statistics are computed on the full matrix, before
# the train/test split below. Consider fitting on the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [29]:
# Hold out 20% of the samples for testing; the seed is fixed so the split
# is the same on every run.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    encoded_labels,
    **split_options,
)


In [34]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score


def fit_and_report(model, label):
    """Fit `model` on the training split and report its test accuracy.

    Parameters
    ----------
    model : estimator with `fit` and `predict` (here `SVC` or `LinearSVC`)
        Fitted in place on `X_train` / `y_train`.
    label : str
        Description of the configuration, inserted in the printed line.

    Returns
    -------
    tuple
        `(predictions, accuracy)`, both computed on `X_test` / `y_test`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Test Accuracy {label}:", accuracy)
    return predictions, accuracy


# The six configurations, in their original order. Each label now names the
# kernel and C: previously the poly-kernel runs printed the same
# "SVC with C=..." text as the RBF runs, and the last LinearSVC line did not
# mention C=2, so the printed results could not be told apart.
svm_experiments = [
    ("SVC (kernel=rbf) with C=1", SVC(kernel='rbf', C=1, gamma='scale')),
    ("SVC (kernel=rbf) with C=2", SVC(kernel='rbf', C=2, gamma='scale')),
    ("LinearSVC with C=1", LinearSVC(C=1)),
    ("SVC (kernel=poly) with C=1", SVC(kernel='poly', C=1, gamma='scale')),
    ("SVC (kernel=poly) with C=2", SVC(kernel='poly', C=2, gamma='scale')),
    ("LinearSVC with C=2", LinearSVC(C=2)),
]

# Test accuracy of every configuration, keyed by its label.
svm_test_accuracies = {}

for exp_label, exp_model in svm_experiments:
    exp_pred, exp_acc = fit_and_report(exp_model, exp_label)
    svm_test_accuracies[exp_label] = exp_acc

    # Keep the variable names this cell has always left in the namespace;
    # after the loop they refer to the last SVC / last LinearSVC run.
    if isinstance(exp_model, LinearSVC):
        svm_linear_svc = exp_model
        y_pred_linear_svc = exp_pred
        test_accuracy_linear_svc = exp_acc
    else:
        svm_svc = exp_model
        y_pred_svc = exp_pred
        test_accuracy_svc = exp_acc


Test Accuracy SVC with C=1: 0.892
Test Accuracy SVC with C=2: 0.888
Test Accuracy LinearSVC with C=1: 0.853
Test Accuracy SVC with C=1: 0.691
Test Accuracy SVC with C=2: 0.756
Test Accuracy LinearSVC: 0.853


In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

# Hyper-parameter grids: SVC is searched over a linear and an RBF kernel,
# LinearSVC over C only.
param_grid_svc = [
    {'C': [0.1, 1, 10], 'kernel': ['linear']},
    {'C': [0.1, 1, 10], 'kernel': ['rbf'], 'gamma': ['scale', 'auto']},
]
param_grid_linear_svc = [{'C': [0.1, 1, 10]}]

# Base estimators handed to the searches.
svm_svc = SVC()
svm_linear_svc = LinearSVC()

# Both searches use 5-fold cross-validation scored on accuracy.
search_options = dict(cv=5, scoring='accuracy')
grid_search_svc = GridSearchCV(svm_svc, param_grid_svc, **search_options)
grid_search_linear_svc = GridSearchCV(svm_linear_svc, param_grid_linear_svc, **search_options)

# Run the searches on the training split (SVC first, then LinearSVC).
grid_search_svc.fit(X_train, y_train)
grid_search_linear_svc.fit(X_train, y_train)

# Best SVC: evaluate on the test split.
best_model_svc = grid_search_svc.best_estimator_
y_pred_svc = best_model_svc.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)

# Best LinearSVC: evaluate on the test split.
best_model_linear_svc = grid_search_linear_svc.best_estimator_
y_pred_linear_svc = best_model_linear_svc.predict(X_test)
accuracy_linear_svc = accuracy_score(y_test, y_pred_linear_svc)

# Report, for each search: selected hyper-parameters, cross-validation score
# and test accuracy.
for report_title, search, test_accuracy in (
    ("Meilleur modèle SVM (SVC) :", grid_search_svc, accuracy_svc),
    ("\nMeilleur modèle SVM (LinearSVC) :", grid_search_linear_svc, accuracy_linear_svc),
):
    print(report_title)
    print("Hyperparamètres :", search.best_params_)
    print("Score de validation croisée :", search.best_score_)
    print("Précision sur les données de test :", test_accuracy)




Meilleur modèle SVM (SVC) :
Hyperparamètres : {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Score de validation croisée : 0.9057499999999999
Précision sur les données de test : 0.892

Meilleur modèle SVM (LinearSVC) :
Hyperparamètres : {'C': 0.1}
Score de validation croisée : 0.87425
Précision sur les données de test : 0.853


In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# One small hyper-parameter grid per baseline classifier.
param_grid_lr = {'C': [0.1, 1, 10]}
param_grid_dt = {'max_depth': [None, 10, 20]}
param_grid_knn = {'n_neighbors': [3, 5, 10]}

# Wrap every baseline in a 5-fold grid search, keyed by a display name.
baseline_specs = (
    ("Logistic Regression", LogisticRegression(), param_grid_lr),
    ("Decision Tree", DecisionTreeClassifier(), param_grid_dt),
    ("KNN", KNeighborsClassifier(), param_grid_knn),
)
models = {
    display_name: GridSearchCV(estimator, grid, cv=5)
    for display_name, estimator, grid in baseline_specs
}

# Fit all searches on the training split first...
for name, model in models.items():
    model.fit(X_train, y_train)

# ...then predict on the test split and compute the accuracy of each model.
predictions = {name: model.predict(X_test) for name, model in models.items()}
performance_measures = {
    name: accuracy_score(y_test, preds) for name, preds in predictions.items()
}

print(performance_measures)


{'Logistic Regression': 0.871, 'Decision Tree': 0.72, 'KNN': 0.641}


Alors, la meilleure précision était :
 Meilleur modèle SVM (SVC) :
Hyperparamètres : {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Score de validation croisée : 0.9057499999999999
Précision sur les données de test : 0.892