<a href="https://colab.research.google.com/github/the77hnx/works-in-the-univ/blob/main/Control_Tp_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Akram Belhadi AI & DataScience***

In [82]:
# Import necessary libraries
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [83]:
# Download NLTK resources
# 'stopwords' provides the English stop-word list loaded further down;
# 'punkt' / 'punkt_tab' are the tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [84]:
# Load dataset (replace with your file path if required)
# NOTE(review): '/content/' looks like the Colab working directory — presumably the
# CSV has to be uploaded there before this cell runs; confirm for other environments.
file_path = '/content/medium_data.csv'  # Update path in Colab
data = pd.read_csv(file_path)

In [85]:
# List the column names of the loaded dataset.
data.columns

Index(['id', 'url', 'title', 'subtitle', 'image', 'claps', 'responses',
       'reading_time', 'publication', 'date'],
      dtype='object')

In [86]:
# Count the missing values in each column.
data.isnull().sum()

Unnamed: 0,0
id,0
url,0
title,0
subtitle,3029
image,147
claps,0
responses,0
reading_time,0
publication,0
date,0


In [87]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

(6508, 10)

In [88]:
# Step 1: Remove punctuation
def remove_punctuation(text):
    """Strip every character listed in ``string.punctuation`` from ``text``.

    Only that ASCII punctuation set is deleted; other symbols (for example
    the typographic apostrophe ’) are left in place.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation characters.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [89]:
# Step 2: Remove special characters
def remove_special_characters(text):
    """Drop everything except ASCII letters, digits and whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every other character (punctuation, accented letters,
        typographic quotes, symbols, ...) removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [90]:
# Step 3: Remove HTML tags
def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. A lone ``<`` without a closing ``>`` is left as is.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ``<...>`` spans removed.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

In [91]:
# Step 4: Remove stop words
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
    """Tokenize ``text`` and drop every token found in ``stop_words``.

    The membership test is case-sensitive, so tokens are only dropped when
    they match an entry of the stop-word set exactly.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [92]:
# Step 5: Convert to lowercase
def to_lowercase(text):
    """Return a lowercased copy of ``text``.

    Args:
        text: The string to convert.

    Returns:
        ``text`` with all cased characters converted to lowercase.
    """
    lowered = text.lower()
    return lowered

In [93]:
# Step 6: Tokenization (Splitting into individual words)
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    tokens = word_tokenize(text)
    return tokens

In [94]:
# Step 7: Convert text to count vectors
vectorizer = CountVectorizer()

def text_to_sequences(texts):
    """Fit the shared ``vectorizer`` on ``texts`` and return their count matrix.

    Despite the name, the result is a bag-of-words representation rather than
    ordered token sequences: one row per input text and one column per
    vocabulary term, holding that term's count in the text.

    Fitting and transforming happen in a single pass (``fit_transform``), so
    ``texts`` is iterated only once and one-shot iterables such as generators
    are handled correctly instead of yielding an empty matrix.

    Args:
        texts: Iterable of strings to vectorize.

    Returns:
        A dense 2-D array of term counts. Note that the dense form can be
        large for big vocabularies.

    Side effects:
        Refits the module-level ``vectorizer`` (its vocabulary is replaced).
    """
    count_matrix = vectorizer.fit_transform(texts)
    return count_matrix.toarray()

In [95]:
# Generate progressive n-grams from padded sequences
def generate_progressive_ngrams_from_sequences(padded_sequences):
    """Build every contiguous n-gram of each sequence's non-padding tokens.

    For each sequence the zero entries (padding) are discarded, the remaining
    tokens are converted to strings, and all n-grams are emitted for
    n = 1 .. number of tokens: first every unigram, then every bigram, and so
    on, each in left-to-right order. Tokens inside an n-gram are joined with
    a single space.

    Args:
        padded_sequences: Iterable of token-id sequences, where 0 marks padding.

    Returns:
        A list with one list of n-gram strings per input sequence.
    """
    def _all_ngrams(sequence):
        # Keep real tokens only; 0 is the padding value.
        words = [str(token) for token in sequence if token != 0]
        count = len(words)
        return [
            ' '.join(words[start:start + size])
            for size in range(1, count + 1)
            for start in range(count - size + 1)
        ]

    return [_all_ngrams(sequence) for sequence in padded_sequences]

In [96]:
# Step 9: Pad sequences to make them the same length
def pad_tokenized_sequences(tokenized_titles, max_length=None):
    """Pad the token-id sequences with trailing zeros to a common length.

    Args:
        tokenized_titles: List of token-id lists.
        max_length: Target length. When it is falsy (``None`` or 0) the length
            of the longest sequence in ``tokenized_titles`` is used.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'`` and
        ``value=0``.
    """
    target_length = max_length or max(map(len, tokenized_titles))
    return pad_sequences(
        tokenized_titles,
        maxlen=target_length,
        padding='post',
        value=0,
    )

In [97]:
# Apply preprocessing steps to the 'title' column.
# HTML tags are stripped first: remove_punctuation deletes '<', '>' and '/', so if it
# ran earlier remove_html_tags would have nothing left to match and the tag names
# would leak into the cleaned text. Lowercasing comes before stop-word removal
# because the membership test in remove_stop_words is case-sensitive.
# The chain always starts from the raw 'title' column, so re-running this cell
# gives the same result.
data['cleaned_title'] = (
    data['title']
    .apply(remove_html_tags)
    .apply(remove_punctuation)
    .apply(remove_special_characters)
    .apply(to_lowercase)
    .apply(remove_stop_words)
)

In [98]:
# Tokenize: fit a Keras Tokenizer on the cleaned titles and convert each title
# into a list of integer word indices (no padding is applied in this cell).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['cleaned_title'])

data['tokenized_title'] = tokenizer.texts_to_sequences(data['cleaned_title'])
tokenized_titles = data['tokenized_title'].tolist()  # plain list of per-title index lists

In [99]:
# Pad the tokenized titles to a common length and add them as a new column.
# `padded_sequences` has to be computed here: nothing earlier in the notebook assigns
# it, so on a fresh kernel (Restart & Run All) this cell would otherwise raise NameError.
padded_sequences = pad_tokenized_sequences(tokenized_titles)
data['padded_tokenized_title'] = list(padded_sequences)  # one padded row per title

In [100]:
# Generate progressive n-grams from padded sequences
# Each row receives the list of all contiguous n-grams (space-joined token ids)
# built from the non-zero entries of its padded sequence.
progressive_ngrams = generate_progressive_ngrams_from_sequences(padded_sequences)
data['progressive_ngrams'] = progressive_ngrams

In [101]:
# Save the processed dataset
# index=False keeps the DataFrame index out of the CSV file.
# NOTE(review): the list/array-valued columns are presumably written as their
# string representation — confirm before reading this file back in.
data.to_csv('/content/processed_medium_data.csv', index=False)

In [102]:
# Display the first few rows of the processed dataset
# Shows the raw title next to each derived column for a quick visual check.
data[['title', 'cleaned_title', 'tokenized_title', 'padded_tokenized_title', 'progressive_ngrams']].head()

Unnamed: 0,title,cleaned_title,tokenized_title,padded_tokenized_title,progressive_ngrams
0,A Beginner’s Guide to Word Embedding with Gens...,beginners guide word embedding gensim word2vec...,"[203, 27, 354, 1365, 2306, 3562, 50]","[203, 27, 354, 1365, 2306, 3562, 50, 0, 0, 0, ...","[203, 27, 354, 1365, 2306, 3562, 50, 203 27, 2..."
1,Hands-on Graph Neural Networks with PyTorch & ...,handson graph neural networks pytorch pytorch ...,"[3563, 622, 38, 54, 254, 254, 1704]","[3563, 622, 38, 54, 254, 254, 1704, 0, 0, 0, 0...","[3563, 622, 38, 54, 254, 254, 1704, 3563 622, ..."
2,How to Use ggplot2 in Python,use ggplot2 python,"[26, 3564, 11]","[26, 3564, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[26, 3564, 11, 26 3564, 3564 11, 26 3564 11]"
3,Databricks: How to Save Files in CSV on Your L...,databricks save files csv local computer,"[3565, 186, 1366, 2307, 809, 195]","[3565, 186, 1366, 2307, 809, 195, 0, 0, 0, 0, ...","[3565, 186, 1366, 2307, 809, 195, 3565 186, 18..."
4,A Step-by-Step Implementation of Gradient Desc...,stepbystep implementation gradient descent bac...,"[810, 382, 1705, 1126, 1367]","[810, 382, 1705, 1126, 1367, 0, 0, 0, 0, 0, 0,...","[810, 382, 1705, 1126, 1367, 810 382, 382 1705..."


In [103]:
# Function to split padded sequences into features and labels
def split_features_labels(padded_sequences):
    """Split each padded sequence into (all tokens but the last, last token).

    Zero entries (padding) are removed first. For what remains:

    * two or more tokens: features are all tokens except the last one and the
      label is the last token;
    * exactly one token: features are empty and that token is the label;
    * no tokens: features are empty and the label is 0.

    Args:
        padded_sequences: Iterable of token-id sequences, where 0 marks padding.

    Returns:
        A ``(features, labels)`` tuple of two lists aligned with the input:
        ``features[i]`` is a list of token ids, ``labels[i]`` a single token id.
    """
    features = []
    labels = []
    for padded in padded_sequences:
        words = [token for token in padded if token != 0]
        if not words:
            # Nothing but padding: no features and a placeholder label of 0.
            features.append([])
            labels.append(0)
            continue
        # For a single-token sequence the slice is empty and the token is the label.
        features.append(words[:-1])
        labels.append(words[-1])
    return features, labels

In [104]:
# Apply the function to padded_tokenized_title
# The padded rows are read back from the DataFrame column as a plain list,
# then split into per-row feature lists and labels.
padded_sequences = data['padded_tokenized_title'].tolist()
features, labels = split_features_labels(padded_sequences)


In [105]:
# Add features and labels to the dataset
# Both lists come from split_features_labels, which appends exactly one entry
# per input sequence, so they line up row-for-row with the DataFrame.
data['features'] = features
data['label'] = labels

In [106]:
# Save the processed dataset
# Written to a separate file that also contains the 'features' and 'label' columns;
# index=False keeps the DataFrame index out of the CSV.
data.to_csv('/content/last_processed_with_features_labels.csv', index=False)

In [110]:
# Display the first few rows of the processed dataset
# Shows each padded sequence next to the features and label derived from it.
data[['padded_tokenized_title', 'features', 'label']].head()

Unnamed: 0,padded_tokenized_title,features,label
0,"[203, 27, 354, 1365, 2306, 3562, 50, 0, 0, 0, ...","[203, 27, 354, 1365, 2306, 3562]",50
1,"[3563, 622, 38, 54, 254, 254, 1704, 0, 0, 0, 0...","[3563, 622, 38, 54, 254, 254]",1704
2,"[26, 3564, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[26, 3564]",11
3,"[3565, 186, 1366, 2307, 809, 195, 0, 0, 0, 0, ...","[3565, 186, 1366, 2307, 809]",195
4,"[810, 382, 1705, 1126, 1367, 0, 0, 0, 0, 0, 0,...","[810, 382, 1705, 1126]",1367
