In [None]:
!pip install PyPDF2
!pip install spacy
!python -m spacy download fr_core_news_sm


import PyPDF2
import spacy
from collections import Counter
import nltk
from nltk.corpus import stopwords
import sys

# Fetch the NLTK stopword corpus, then keep the French list as a set so that
# membership tests are cheap.
nltk.download("stopwords")
nltk_stopwords = {*stopwords.words("french")}

# Small French spaCy pipeline; used for tokenisation and its stopword list.
nlp = spacy.load("fr_core_news_sm")

# Extra words to ignore on top of the stock French stopword lists: common
# French function words, a few English words, and short fragments or
# document-specific terms that would otherwise dominate the ranking.
#
# process_text() lower-cases the text before filtering, so a capitalised entry
# could never match anything. Every entry is therefore normalised to lower
# case here, whatever spelling it is written with below.
extra_stopwords = {
    word.lower()
    for word in (
        "plus", "tout", "comme", "a", "aux", "ainsi", "alors", "après",
        "aucun", "aucune", "autre", "avant", "avec", "avoir", "ça", "car",
        "ce", "cela", "cette", "cet", "ces", "dans", "donc", "dont", "du",
        "elle", "en", "est", "et", "eux", "il", "ils", "je", "là", "la", "le",
        "les", "leur", "lui", "ma", "mais", "me", "même", "mes", "moi", "mon",
        "ne", "nos", "notre", "nous", "on", "ou", "où", "par", "pas", "pour",
        "quand", "que", "qui", "sa", "se", "ses", "si", "son", "sont", "sur",
        "ta", "te", "tes", "toi", "ton", "tous", "trop", "tu", "un", "une",
        "vos", "votre", "vous", "y",
        "of", "the", "in", "for", "b", "and", "vol", "e", "r", "f", "v",
        "ser", "loi", "re", "sie", "don", "mots", "titre", "article", "code",
        "art", "alinéa", "remplace", "etat", "État", "rédigé", "disposition",
        "ar", "décret", "ministre", "ii", "modifié", "président",
        "mentionnés", "présent", "articles", "cas", "modifie", "présente",
        "général", "représentant", "texte", "ticle", "genre", "Conditions",
        "Tacle", "Charge",
    )
}


# Single lookup set combining the NLTK list, the custom additions and spaCy's
# built-in French stopwords.
french_stopwords = set().union(
    nltk_stopwords,
    extra_stopwords,
    nlp.Defaults.stop_words,
)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with a single
        space appended after each page's text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Build the result while the file is still open: page text is read
        # from the underlying stream as each page is visited.
        return "".join(page.extract_text() + " " for page in pages)

def process_text(text, top_n):
    """Return the ``top_n`` most frequent non-stopword words in ``text``.

    The text is lower-cased and tokenised with the spaCy tokenizer; only
    purely alphabetic tokens that are not in ``french_stopwords`` are counted.

    Args:
        text: Raw text to analyse.
        top_n: Number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # Only the token text and the lexical ``is_alpha`` flag are used, so the
    # tokenizer alone is enough. Calling ``nlp(...)`` would also run the
    # tagger, parser and NER components, which is wasted work here and makes
    # long PDF texts slow, memory-hungry, or rejected by the pipeline's
    # maximum-length check.
    doc = nlp.tokenizer(text.lower())

    word_frequencies = Counter(
        token.text
        for token in doc
        if token.is_alpha and token.text not in french_stopwords
    )
    return word_frequencies.most_common(top_n)

# Ask the user to upload a PDF through the Colab file picker.
from google.colab import files
uploaded = files.upload()

# The upload result is keyed by file name; analyse the first uploaded file.
pdf_path = list(uploaded)[0]

# Number of top-ranked words to report.
top_n = int(input("how many top used words do you want to see?"))

# Extract the PDF text and rank its words.
text = extract_text_from_pdf(pdf_path)
most_common_words = process_text(text, top_n)

# Print the ranking, one "word: count" line per entry.
print("\nmost used words in this file:(after removing Stopwords):")
for word, freq in most_common_words:
    print(word, freq, sep=": ")

import matplotlib.pyplot as plt

def plot_word_distribution(word_counts):
    """Show a bar chart and a pie chart of word frequencies.

    Args:
        word_counts: Iterable of ``(word, count)`` pairs, for example the
            list returned by ``Counter.most_common``.

    Returns:
        None. When ``word_counts`` is empty a short notice is printed and no
        figure is created.
    """
    pairs = list(word_counts)

    # ``zip(*[])`` yields nothing, so unpacking it into two names would raise
    # ValueError; with no data there is also nothing to draw.
    if not pairs:
        print("No words to plot.")
        return

    # Split the pairs into parallel tuples of labels and values.
    words, counts = zip(*pairs)

    # Bar chart
    plt.figure(figsize=(10, 5))
    plt.bar(words, counts, color='skyblue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Top Words - Bar Chart')
    plt.xticks(rotation=45)
    plt.show()

    # Pie chart: each slice is a word's share of the plotted counts.
    plt.figure(figsize=(7, 7))
    plt.pie(counts, labels=words, autopct='%1.1f%%', startangle=90)
    plt.title('Top Words - Pie Chart')
    plt.show()

# Draw the bar and pie charts for the most frequent words computed earlier.
plot_word_distribution(most_common_words)
