### Text Summarisation

CamemBERT Model

In [5]:
import torch
from transformers import RobertaTokenizerFast, EncoderDecoderModel

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hugging Face checkpoint used for French abstractive summarisation.
ckpt = 'mrm8488/camembert2camembert_shared-finetuned-french-summarization'

# The tokenizer and the encoder-decoder model are loaded from the same checkpoint;
# the model is then moved to the selected device.
tokenizer = RobertaTokenizerFast.from_pretrained(ckpt)
model = EncoderDecoderModel.from_pretrained(ckpt)
model = model.to(device)

def generate_summary(text):
    """Return a generated summary of ``text`` as a plain string.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    The text is tokenized as a batch of one, truncated to 512 tokens and
    padded up to that same length, so anything beyond the first 512 tokens
    of ``text`` does not reach the model. No generation arguments other than
    the attention mask are passed to ``model.generate``.

    Args:
        text: The document to summarise.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    generated = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    # The batch holds a single document, so only the first sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

## Summarisation model on Hugging Face: https://huggingface.co/mrm8488/camembert2camembert_shared-finetuned-french-summarization

Downloading tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


BERT Model

In [None]:
# !pip install bert-extractive-summarizer
from summarizer import Summarizer

# Create a BERT extractive summarizer with the library's default settings.
# It is only referenced by the commented-out "BERT MODEL" alternative in the
# summarisation loop; the active code path calls generate_summary() instead.
summarizer = Summarizer()

In [6]:
import os
import textwrap
import time

# Summarise every text in `files/`, save each wrapped summary under `summary/`,
# and write a metadata table that includes the summaries to sentiment_output.csv.
#
# Fixes relative to the previous version of this cell:
#   * `df` was used at the end of the cell without being defined in it (it is
#     only created by a later cell), so a fresh kernel raised NameError; the
#     metadata table is now loaded here.
#   * the generated summaries were never stored in the table even though the
#     sentiment-analysis cell reads a "summary" column from sentiment_output.csv.
#   * files are opened with an explicit encoding instead of the platform default.
#   * the output directory is created if missing and non-file directory entries
#     (e.g. checkpoint folders) are skipped.
import pandas as pd  # local to this cell so it runs without relying on later cells

# Load the metadata first so a schema problem fails before the slow model loop.
# Assumes INDH_dataset.tsv has a `filename` column holding each file name without
# its ".txt" extension - TODO confirm against the dataset.
df = pd.read_csv('INDH_dataset.tsv', sep='\t')
if 'filename' not in df.columns:
    raise KeyError("INDH_dataset.tsv has no 'filename' column to attach summaries to")

os.makedirs('summary', exist_ok=True)

# Sorted so that the processing order is reproducible across runs and machines.
files = sorted(
    name for name in os.listdir('files')
    if os.path.isfile(os.path.join('files', name))
)

summaries = {}

for i, each in enumerate(files):
    print(f"{i+1}/{str(len(files))}: starting summary for {each}...")

    t1 = time.time()

    # Assumes the corpus is UTF-8 encoded - TODO confirm.
    with open(f'files/{each}', 'r', encoding='utf-8') as file:
        content = file.read()

    ## BERT MODEL: Generate the summary
    # summary = summarizer(content, min_length=50, max_length=200)

    ## CAMEMBERT MODEL: Generate the summary
    summary = generate_summary(content)

    wrapped_text = textwrap.fill(summary, width=80)

    # Save the wrapped text to a file
    with open(f'summary/{each}', 'w', encoding='utf-8') as file:
        file.write(wrapped_text)

    # Keyed by the file name without its extension, to match the metadata table.
    summaries[os.path.splitext(each)[0]] = wrapped_text

    t2 = time.time()

    print(f"finished in {round(t2-t1, 3)}s!")

# Rows with no matching text file get a missing value in `summary`.
df['summary'] = df['filename'].map(summaries)
df.to_csv("sentiment_output.csv")

1/200: starting summary for PigaultLebrun_Adele.txt...
finished in 4.25s!
2/200: starting summary for Genlis_Emigres.txt...
finished in 4.508s!
3/200: starting summary for Dorat_Malheurs.txt...
finished in 2.683s!
4/200: starting summary for Quesne_Folies.txt...
finished in 5.526s!
5/200: starting summary for Abbes_Voyage.txt...
finished in 3.33s!
6/200: starting summary for Charriere_Observations.txt...
finished in 3.058s!
7/200: starting summary for Loaisel_Comtesse.txt...
finished in 5.487s!
8/200: starting summary for Retif_Paysanne.txt...
finished in 4.29s!
9/200: starting summary for Arnaud_Selicourt.txt...
finished in 4.16s!
10/200: starting summary for Marchadier_Isle.txt...
finished in 3.759s!
11/200: starting summary for Diderot_Jacques.txt...
finished in 3.119s!
12/200: starting summary for Arnaud_Batilde.txt...
finished in 4.058s!
13/200: starting summary for Rousseau_Julie.txt...
finished in 4.926s!
14/200: starting summary for Sade_Infortunes.txt...
finished in 4.208s!
15

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sentiment model 1. The tokenizer and classifier come from the same checkpoint.
# NOTE: this rebinds the global `tokenizer`, which generate_summary() also uses;
# re-run the summarisation setup cell before summarising again.
sentiment_ckpt = "ac0hik/Sentiment_Analysis_French"
tokenizer = AutoTokenizer.from_pretrained(sentiment_ckpt)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_ckpt)

## Sentiment analysis model on Hugging Face: https://huggingface.co/ac0hik/Sentiment_Analysis_French

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sentiment model 2. Running this cell replaces whichever `tokenizer` and
# `sentiment_model` were loaded before, so later cells use this checkpoint.
sentiment_ckpt = "cmarkea/distilcamembert-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(sentiment_ckpt)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_ckpt)

Downloading tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/272M [00:00<?, ?B/s]

### Sentiment Analysis

In [12]:
import pandas as pd
import torch
import torch.nn.functional as F
import time

def _predict_sentiment(text):
    """Classify one text with the currently loaded sentiment model.

    Uses the module-level ``tokenizer`` and ``sentiment_model``, so the result
    depends on which model-loading cell was run last.

    Args:
        text: The text to classify (truncated by the tokenizer if too long).

    Returns:
        A ``(label, probability)`` tuple: the ``id2label`` name of the
        highest-scoring class and the softmax probability of that class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits

    # Softmax turns the raw scores into per-class probabilities.
    class_probabilities = F.softmax(logits, dim=-1)
    predicted_class_id = logits.argmax().item()

    label = sentiment_model.config.id2label[predicted_class_id]
    return label, class_probabilities[0, predicted_class_id].item()


# `df` is not used below; it is kept because the summarisation cell refers to `df`.
df = pd.read_csv('INDH_dataset.tsv', sep='\t')
summary_df = pd.read_csv("sentiment_output.csv")

# Predictions are collected in row order and assigned once after the loop,
# instead of re-scanning the whole `summary` column for a text match on every row.
sentiments = []
probabilities = []

for i, each in enumerate(summary_df["summary"]):
    print(f"{i+1}/{str(len(summary_df))}: starting sentiment analysis for {each}...")

    t1 = time.time()

    sentiment, predicted_probability = _predict_sentiment(each)
    sentiments.append(sentiment)
    probabilities.append(predicted_probability)

    t2 = time.time()

    print(f"finished in {round(t2-t1, 3)}s!")

# Add the prediction label and its probability to the DataFrame.
summary_df['sentiment'] = sentiments
summary_df['probability'] = probabilities

# NOTE: the output name is fixed, whichever sentiment model is currently loaded.
# index=False stops an extra "Unnamed: 0" column from being added every time the
# file is read back and written again.
summary_df.to_csv("sentiment_output_2.csv", index=False)

1/200: starting sentiment analysis for La chronique de Roger-Pol Droit, à propos de "Le Vestibule, Hôtel des Auteurs
François", de Jean-Claude Nouilly....
finished in 0.083s!
2/200: starting sentiment analysis for A la veille de la présidentielle, "Le Monde" donne la parole à ceux qui font
bouger les régions dans le domaine culturel....
finished in 0.024s!
3/200: starting sentiment analysis for L'E fils du potentat comme celui du savetier sont l'ouvrage d'un coup de cul, et
d'une petite marchande de modes de quinze, fraîche comme la plus jeune des
grâces....
finished in 0.021s!
4/200: starting sentiment analysis for Dans sa chronique pour le cahier "Sport&Forme", l'écrivain rend hommage à
l'ancien capitaine de Vaisseau, qui fut son premier voyage en Amérique du Sud....
finished in 0.025s!
5/200: starting sentiment analysis for La chronique de Roger-Pol Droit, à propos de "Madams de Saturnin", de Jean-
Pierre Filiu....
finished in 0.023s!
6/200: starting sentiment analysis for La chroni

In [17]:
## Average probability of the predicted class ("predictive strength") per model.
## Which model produced which file depends on the order the cells were run in:
## the labels below assume sentiment_output.csv holds the predictions of
## ac0hik/Sentiment_Analysis_French and sentiment_output_2.csv those of
## cmarkea/distilcamembert-base-sentiment.

# Model 1 - ac0hik/Sentiment_Analysis_French
summary_df_1 = pd.read_csv("sentiment_output.csv")
avg_1 = summary_df_1["probability"].mean()

# Model 2 - cmarkea/distilcamembert-base-sentiment
summary_df_2 = pd.read_csv("sentiment_output_2.csv")
avg_2 = summary_df_2["probability"].mean()

print(f"Average probability strength of classification for ac0hik/Sentiment_Analysis_French: {round(avg_1, 4)}")
print(f"Average probability strength of classification for cmarkea/distilcamembert-base-sentiment: {round(avg_2, 4)}")

Average probability strength of classification for ac0hik/Sentiment_Analysis_French: 0.7853
Average probability strength of classification for cmarkea/distilcamembert-base-sentiment: 0.3864


### Gender & Sentiment Encoding

In [8]:
# Integer codes for the sentiment labels and for the author-gender codes.
sentiment_dict = {"negative": 0, "neutral": 1, "positive": 2}
gender_dict = {"U": 0, "M": 1, "F": 2}

# Direct dictionary lookups: a value missing from a mapping raises KeyError
# rather than being silently turned into a missing value.
summary_df['sentiment_label'] = list(map(sentiment_dict.__getitem__, summary_df['sentiment']))
summary_df['gender_label'] = list(map(gender_dict.__getitem__, summary_df['au-gender']))

summary_df

Unnamed: 0.1,Unnamed: 0,filename,au-name,au-birth,au-death,title,au-gender,firsted-yr,printSource-yr,form,...,printSource_pubPlace,printSource_publisher,printSource_date,resp_datacapture,resp_encoding,summary,sentiment,probability,sentiment_label,gender_label
0,0,Abbes_Voyage,Guillaume d'Abbes de Cabreroles,1718,1802,Voyage dans les espaces,M,1758,1758,autodiegetic,...,London,unknown,1758,"double keying by ""Jiangsu"", China","Julia Dudar, Christof Schöch","La chronique de Roger-Pol Droit, à propos de ""...",neutral,0.871761,1,1
1,1,AndreSerieys_Comte,"Jean-François André, Antoine Sérieys","unknown, 1755","unknown, 1819",Le comte d'A*** ou les aventures d'un jeune vo...,M,1800,1800,autodiegetic,...,Paris,Monory,1800,Münchener Digitalisierungszentrum,Johanna Konstanciak,"A la veille de la présidentielle, ""Le Monde"" d...",positive,0.858749,2,1
2,2,Anonym_Enfant,Anonymous,unknown,unknown,L'enfant du bordel,U,1800,1800,autodiegetic,...,Paris,unknown,1800,Wikisource contributors,"Henning Gebhard, Johanna Konstanciak",L'E fils du potentat comme celui du savetier s...,neutral,0.841893,1,0
3,3,Anonym_Histoire,Anonymous,unknown,unknown,Histoire d'un peuple nouveau,U,1757,1757,heterodiegetic,...,Londres,aux dépens d'une société de libraires,1757,gallica,Johanna Konstanciak,"Dans sa chronique pour le cahier ""Sport&Forme""...",neutral,0.706780,1,0
4,4,Anonym_Suzon,Anonymous,unknown,unknown,"Mémoires de Suzon, soeur de Dom Bougre",U,1778,1778,autodiegetic,...,London,unknown,1778,Wikisource contributors,Julia Dudar,"La chronique de Roger-Pol Droit, à propos de ""...",neutral,0.870666,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,Voltaire_Oreilles,Voltaire,1694,1778,Les oreilles du comte de Chesterfield,M,1775,1877,mixed,...,Paris,Garnier,1877,Wikisource contributors,"Henning Gebhard, Johanna Konstanciak","Dans sa chronique du cahier ""Sport&Forme"", le ...",neutral,0.756557,1,1
196,196,Voltaire_Princesse,Voltaire,1694,1778,La Princesse de Babylone,M,1768,1877,heterodiegetic,...,Paris,Garnier,1877,Wikisource contributors,Julia Dudar,"Le Palais du ROI de BABYLONE, au cœur de l'Eup...",neutral,0.543228,1,1
197,197,Voltaire_Scarmentado,Voltaire,1694,1778,Histoire des voyages de Scarmentado,M,1756,1877,autodiegetic,...,Paris,Garnier,1877,Wikisource contributors,"Henning Gebhard, Johanna Konstanciak","A la veille de la présidentielle, ""Le Monde"" d...",positive,0.858749,2,1
198,198,Voltaire_Songe,Voltaire,1694,1778,Songe de Platon,M,1756,1877,heterodiegetic,...,Paris,Garnier,1877,Wikisource contributors,"Henning Gebhard, Johanna Konstanciak","La chronique de Roger-Pol Droit, à propos de «...",negative,0.894822,0,1


In [9]:
# Persist the table, now including the encoded label columns.
# This overwrites the file that the sentiment-analysis cell reads as its input.
# index=False: the table is loaded from this CSV and saved back to it, and writing
# the index each time adds another "Unnamed: 0"-style column on every round trip.
summary_df.to_csv("sentiment_output.csv", index=False)

In [10]:
# Number of summaries assigned to each sentiment label in the current table.
summary_df["sentiment"].value_counts()

sentiment
neutral     91
positive    58
negative    51
Name: count, dtype: int64