## Ecrire un programme qui calcule la similarité entre deux documents avec deux approches différentes :

### Uniquement en utilisant la comparaison lexicale des mots des deux documents et en utilisant WodNet

In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Télécharger les ressources nécessaires de NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenisation des mots
    tokens = word_tokenize(text.lower())
    # Suppression des mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatisation des mots
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

def lexical_similarity(doc1, doc2):
    tokens1 = preprocess_text(doc1)
    tokens2 = preprocess_text(doc2)
    
    # Construction des ensembles de mots uniques pour chaque document
    word_set1 = set(tokens1)
    word_set2 = set(tokens2)
    
    # Calcul de la similarité lexicale en utilisant l'intersection des ensembles de mots
    print('word_set1 inter word_set2'+str(word_set1.intersection(word_set2)))
    print('word_set1 inter word_set2'+str(word_set2.intersection(word_set1)))
    similarity = len(word_set1.intersection(word_set2)) / float(len(word_set1.union(word_set2)))
    return similarity

def wordnet_similarity(doc1, doc2):
    tokens1 = preprocess_text(doc1)
    tokens2 = preprocess_text(doc2)
    
    # Calcul de la similarité en utilisant WordNet
    synsets1 = set()
    synsets2 = set()
    
    # Obtenir les synsets pour chaque mot dans le premier document
    for token in tokens1:
        synsets = wordnet.synsets(token)
        synsets1.update(synsets)
    
    # Obtenir les synsets pour chaque mot dans le deuxième document
    for token in tokens2:
        synsets = wordnet.synsets(token)
        synsets2.update(synsets)
    
    # Calcul de la similarité en utilisant le coefficient de similarité de path
    max_similarity = 0
    for synset1 in synsets1:
        for synset2 in synsets2:
            similarity = synset1.path_similarity(synset2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity
    
    return max_similarity

# Exemple d'utilisation
document1 = "The cat sits on the mat."
document2 = "The dog is on the mat."

lex_similarity = lexical_similarity(document1, document2)
wrndt_similarity = wordnet_similarity(document1, document2)

print("Similarité lexicale:", lex_similarity)
print("Similarité WordNet:", wrndt_similarity)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


word_set1 inter word_set2{'mat', '.'}
word_set1 inter word_set2{'mat', '.'}
Similarité lexicale: 0.4
Similarité WordNet: 1.0


In [3]:
document3 = "everything everywhere all at once ."
document4 = "The dog is on the mat."

lex_similarity = lexical_similarity(document3, document4)
wrdnt_similarity = wordnet_similarity(document3, document4)

print("Similarité lexicale:", lex_similarity)
print("Similarité WordNet:", wrdnt_similarity)

word_set1 inter word_set2{'.'}
word_set1 inter word_set2{'.'}
Similarité lexicale: 0.2
Similarité WordNet: 0.3333333333333333


## Utiliser un classificateur basé sur votre implémentation de la régression logistique pour implémenter un système d'analyse de sentiment:

In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler

class LogisticRegression:
    
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
        self.scaler = StandardScaler()  # StandardScaler for feature scaling

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        # Scale the features
        X = self.scaler.fit_transform(X)

        # Gradient descent optimization
        for _ in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)

            dw = (1 / num_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / num_samples) * np.sum(y_pred - y)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        # Scale the features
        X = self.scaler.transform(X)
        
        # Predict the probability of the positive class
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_model)
        
        # Convert probability to class labels
        y_pred_class = np.where(y_pred > 0.5, 1, 0)
        return y_pred_class

    def score(self, X, y):
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == y)
        return accuracy

In [5]:
import pandas as pd

tweets = pd.read_csv('/content/data.csv')
tweets.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,what did just say vote for modi welcome bjp t...,1.0
2,asking his supporters prefix chowkidar their n...,1.0
3,answer who among these the most powerful world...,1.0
4,with upcoming election india saga going import...,1.0


In [6]:
tweets.isnull().sum()

clean_text    2
category      0
dtype: int64

In [7]:
# Text preprocessing
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def expand_contractions(sentence):
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "I'd": "I would",
        "I'd've": "I would have",
        "I'll": "I will",
        "I'll've": "I will have",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }

    # Create a regular expression pattern to match the contractions
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')

    # Function to replace the matched contractions with their expanded forms
    def replace(match):
        return contractions[match.group(0)]

    # Use the sub() function with the replace function to expand contractions
    expanded_sentence = re.sub(pattern, replace, sentence)

    return expanded_sentence

def lemmatize_sentence(sentence):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(sentence)
    lemmatized_tokens = [lemmatizer.lemmatize(token, 'v') for token in tokens]
    lemmatized_sentence = ' '.join(lemmatized_tokens)
    return lemmatized_sentence

def delete_stopwords(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_sentence = ' '.join(filtered_tokens)
    return filtered_sentence

def text_cleaning(x):
    # To replace consecutive whitespace and newline characters with a single space character in the string
    q = re.sub('\s+\n+', ' ', x)
    
    # To lowercase
    q = q.lower()
    
    # expand contractions
    q = expand_contractions(q)
    
    # To replace any non-alphanumeric character in the string 
    q = re.sub('[^a-zA-Z0-9]', ' ', q)
    
    # Lemmatization
    q = lemmatize_sentence(q)
    
    return delete_stopwords(q)

# Cleaning
def clean(data):
    data.dropna(axis=0, inplace=True)
    data.drop_duplicates(inplace=True)
    
    data['clean_text'] = data['clean_text'].apply(text_cleaning)
    data.dropna(axis=0, inplace=True)
    data.drop_duplicates(inplace=True)
    return data

In [8]:
from copy import deepcopy

In [9]:
copy_data = deepcopy(tweets)
cleaned_data = clean(copy_data)
cleaned_data.head()

Unnamed: 0,clean_text,category
0,modi promise minimum government maximum govern...,-1.0
1,say vote modi welcome bjp tell rahul main camp...,1.0
2,ask supporters prefix chowkidar name modi grea...,1.0
3,answer among powerful world leader today trump...,1.0
4,upcoming election india saga go important pair...,1.0


In [10]:
cleaned_data.duplicated().sum()

0

In [11]:
cleaned_data.shape

(105352, 2)

In [12]:
cleaned_data.isnull().sum()

clean_text    0
category      0
dtype: int64

In [13]:
tweets_list = list(cleaned_data['clean_text'].apply(lambda x: x.split()))
tweets_list[0] # list of lists, where each tweet is a list of tokens, finally we have a list of tweets

['modi',
 'promise',
 'minimum',
 'government',
 'maximum',
 'governance',
 'expect',
 'begin',
 'difficult',
 'job',
 'reform',
 'state',
 'take',
 'years',
 'get',
 'justice',
 'state',
 'business',
 'exit',
 'psus',
 'temples']

In [14]:
from gensim.models import Word2Vec
# train model
w2v_model = Word2Vec(tweets_list, vector_size = 100, window = 5, min_count=5, sg=0)

In [15]:
len(w2v_model.wv.index_to_key)

14501

In [16]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in w2v_model.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(w2v_model.wv.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

In [17]:
tweets_temp = cleaned_data['clean_text'].apply(document_vector)

  wv1_mean = wv1_.mean(axis=0)


In [18]:
tweets_temp.shape

(105352,)

In [19]:
# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 100
tweets_vec = np.ones((len(tweets_temp), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = tweets_temp.iloc[i]

tweets_vec.shape # this itself is your final FEATURE MATRIX

(105352, 100)

In [20]:
# Create a new DF to store these new documnent features
def labels(x):
  return 1 if x == 1.0 else 0
df = pd.DataFrame(tweets_vec)
df['y'] = cleaned_data['category'].apply(labels)
df.dropna(axis=0, inplace=True)
df['y'] = df['y'].astype(int)


In [21]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,y
105347,-0.109156,0.218626,-0.094113,-0.050349,-0.656277,-0.883146,0.066947,0.244081,-0.131661,-0.479267,...,0.244661,0.277166,0.359339,0.164642,0.141335,0.085223,-0.301285,0.174231,-0.013893,1
105348,0.151471,-0.030912,-0.077932,0.459758,-0.705213,-0.586198,0.548455,0.348201,-0.624291,-0.122178,...,-0.214388,-0.174044,-0.075074,0.358728,0.4791,0.429342,-1.042958,0.271623,0.218519,0
105349,0.409441,0.209597,-0.9263,-0.082187,-0.632302,0.039838,0.159867,0.279283,-1.058787,-0.229327,...,0.003318,0.087381,0.05275,0.938481,0.629492,-0.203367,0.08305,0.286326,-0.449901,1
105350,-0.132154,0.090676,0.13629,0.083055,-0.358178,-0.684826,-0.001598,0.740921,-0.419746,-0.04507,...,0.074847,-0.200783,-0.151354,0.388028,0.435986,0.141373,-0.323119,-0.062025,0.327149,1
105351,-0.288728,0.161791,0.058654,-0.232482,-0.571144,-0.761337,0.177801,1.065071,-0.198533,-0.055657,...,0.036758,-0.235612,-0.460114,0.236416,0.351797,-0.007563,-0.469839,0.376313,0.127802,1


In [22]:
X = df.drop('y',axis=1)
y = df['y']
X.shape

(102988, 100)

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

len(X_train)

82390

In [24]:
LR_model = LogisticRegression()
LR_model.fit(X_train, y_train)

In [25]:
df['y'].info()

<class 'pandas.core.series.Series'>
Int64Index: 102988 entries, 0 to 105351
Series name: y
Non-Null Count   Dtype
--------------   -----
102988 non-null  int64
dtypes: int64(1)
memory usage: 1.6 MB


In [26]:
y_pred=LR_model.predict(X_test)

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6846
           1       0.67      1.00      0.80     13752

    accuracy                           0.67     20598
   macro avg       0.33      0.50      0.40     20598
weighted avg       0.45      0.67      0.53     20598



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
