Models Processing

In [11]:
# Sentiment checkpoints to load, keyed by a short alias.
# The alias is reused as the prefix of the per-model result columns
# ('<alias> score' / '<alias> label'), and 'model6' is special-cased when the
# models are invoked, so the aliases must stay stable.
model_paths = dict(
    model1="cardiffnlp/twitter-roberta-base-sentiment-latest",
    model2="nlptown/bert-base-multilingual-uncased-sentiment",
    model3="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    model4="siebert/sentiment-roberta-large-english",
    model5="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    model6="finiteautomata/bertweet-base-sentiment-analysis",
    model7="j-hartmann/sentiment-roberta-large-english-3-classes",
)

In [12]:
from transformers import pipeline

# Build one 'sentiment-analysis' pipeline per checkpoint, keyed by the same
# alias used in model_paths.
models = {
    alias: pipeline('sentiment-analysis', model=checkpoint)
    for alias, checkpoint in model_paths.items()
}

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassifica

Text Importing

In [13]:
import pandas as pd
# sheet_name=None loads every worksheet, so despite its name `df` is a dict
# mapping sheet name -> DataFrame (it is indexed by sheet name and iterated
# with .items() further down).
df = pd.read_excel(
    'posts_first_targil.xlsx',
    sheet_name=None,
)

In [14]:
# Align the 'J-P' sheet with the other sheets, which call the article-text
# column 'Body Text' (presumably 'J-P' ships it as 'Body' -- confirm against the
# workbook). The renamed frame is assigned back instead of using inplace=True,
# so the cell states explicitly which object it replaces.
df['J-P'] = df['J-P'].rename(columns={'Body': 'Body Text'})

In [15]:
# Sanity check: show each sheet's name followed by its column labels.
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}", sheet_df.columns, sep='\n')

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [16]:
# Read the two keyword lists, one entry per line of each text file.
with open('israel.txt', 'r') as israel_file:
    with open('palestine.txt', 'r') as palestine_file:
        i_word = israel_file.read().splitlines()
        p_word = palestine_file.read().splitlines()


In [17]:
# Lower-case the keywords so they compare against the lower-cased article text.
# Surrounding whitespace is stripped and blank entries are dropped: an empty
# string is a substring of every sentence, so a single blank line in either file
# would make every sentence "match" that list.
i_word = [word.strip().lower() for word in i_word if word.strip()]
p_word = [word.strip().lower() for word in p_word if word.strip()]

In [18]:
# Display the keyword list loaded from israel.txt for a visual check.
# NOTE(review): the saved output contains repeated entries ('jerusalem',
# 'innovation'); harmless for the any(...) matching, but worth de-duplicating.
i_word

['zionism',
 'homeland',
 'security',
 'independence',
 'jerusalem',
 'idf',
 'peace',
 'democracy',
 'resilience',
 'nationhood',
 'unity',
 'innovation',
 'strength',
 'sovereignty',
 'hope',
 'freedom',
 'patriotism',
 'courage',
 'shield',
 'jerusalem',
 'jewish',
 'defense',
 'victorious',
 'innovation',
 'heritage',
 'stability',
 'pride',
 'prosperity',
 'protection',
 'self-defense']

In [19]:
import numpy as np
import pandas as pd

# Collect one record per sentence that mentions keywords from exactly one of
# the two lists. Each record keeps the source sheet ('paper'), the row label of
# the article inside that sheet ('article_index'), the lower-cased sentence and
# which list matched ('I' or 'P').
# NOTE: 'article_index' is the row label within its own sheet, so it identifies
# an article only together with 'paper'.
# NOTE(review): matching is plain substring containment, so e.g. the keyword
# 'unity' also matches 'community' -- confirm whether whole-word matching is
# what the analysis intends.
csv1 = []

for sheet_name, sheet_df in df.items():
    # Text columns to mine. 'sub_title' is detected by column presence rather
    # than by a hard-coded sheet name, so any sheet that carries it is included.
    text_columns = ['Body Text', 'title']
    if 'sub_title' in sheet_df.columns:
        text_columns.append('sub_title')

    for index, row in sheet_df.iterrows():

        article_sentences = []
        for column in text_columns:
            cell = row[column]
            if pd.notna(cell):
                # str() guards against cells that were parsed as numbers/dates.
                article_sentences += str(cell).lower().split('.')

        for sentence in article_sentences:
            # Scan each keyword list once per sentence.
            mentions_i = any(word in sentence for word in i_word)
            mentions_p = any(word in sentence for word in p_word)

            # Keep only sentences that mention exactly one side.
            if mentions_i == mentions_p:
                continue

            csv1.append({
                'paper': sheet_name,
                'article_index': index,
                'sentence': sentence,
                'I/P': 'I' if mentions_i else 'P',
            })

In [None]:
from tqdm import tqdm
import warnings

# Label normalization
# Raw label strings that are treated as positive / negative (presumably the
# label vocabularies of the checkpoints in use -- extend these lists if a model
# reports something else).
positive_labels = ['POSITIVE', 'positive', '4 stars', '5 stars', 'POS']
negative_labels = ['NEGATIVE', 'negative', '1 star', '2 stars', 'NEG']

def normalize_label(label):
    """Map a raw model label onto 'POS', 'NEG' or 'NUT'.

    The comparison ignores letter case and surrounding whitespace, so variants
    such as 'Positive' are recognised as well as the exact entries of
    ``positive_labels`` / ``negative_labels``.

    Args:
        label: Raw label as reported by a model.

    Returns:
        'POS' if the label matches ``positive_labels``, 'NEG' if it matches
        ``negative_labels``, otherwise 'NUT' (everything else, e.g. '3 stars'
        or 'neutral', is treated as neutral).
    """
    key = str(label).strip().casefold()
    if any(key == known.casefold() for known in positive_labels):
        return 'POS'
    if any(key == known.casefold() for known in negative_labels):
        return 'NEG'
    return 'NUT'

# Chunk splitting
def split_into_chunks(sentence, chunk_size=128):
    """Split ``sentence`` into consecutive pieces of at most ``chunk_size`` words.

    Words are the tokens obtained by splitting on a single space, and each
    chunk is re-joined with single spaces. At least one chunk is always
    returned (an empty sentence yields ``['']``).

    Args:
        sentence: Text to split.
        chunk_size: Maximum number of space-separated words per chunk.

    Returns:
        List of chunk strings in original order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (a non-positive size
            can never consume the word list).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    words = sentence.split(' ')
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Determine majority label
def determine_majority_label(label_counts):
    """Return the label whose count is strictly greater than both others.

    Args:
        label_counts: Mapping with the keys 'POS', 'NEG' and 'NUT'.

    Returns:
        'POS', 'NEG' or 'NUT' when that label strictly leads, otherwise 'N/A'
        (i.e. whenever the top count is shared).
    """
    pos = label_counts['POS']
    neg = label_counts['NEG']
    nut = label_counts['NUT']

    if pos > neg and pos > nut:
        return 'POS'
    if neg > pos and neg > nut:
        return 'NEG'
    if nut > pos and nut > neg:
        return 'NUT'
    return 'N/A'

# Process model results
def process_model_results(results):
    """Collapse several predictions into a single label and mean score.

    Args:
        results: Iterable of mappings, each with a 'label' and a 'score' entry.

    Returns:
        Dict with 'label' (the outcome of ``determine_majority_label`` over the
        normalized labels) and 'score' (the mean score of the predictions that
        carried that label, or 1 when the outcome is 'N/A').
    """
    buckets = ('POS', 'NEG', 'NUT')
    label_counts = dict.fromkeys(buckets, 0)
    score_sums = dict.fromkeys(buckets, 0)

    for prediction in results:
        bucket = normalize_label(prediction['label'])
        confidence = prediction['score']

        if bucket not in label_counts:
            continue
        label_counts[bucket] += 1
        score_sums[bucket] += confidence

    winner = determine_majority_label(label_counts)
    if winner == 'N/A':
        mean_score = 1
    else:
        mean_score = score_sums[winner] / label_counts[winner]

    return {'label': winner, 'score': mean_score}

# Process a single sentence
def process_sentence(sentence, models):
    """Score one sentence record with every model and store the ensemble vote.

    The record is updated in place with:
      * '<model name> score' / '<model name> label' -- the raw top prediction
        of each model;
      * 'majority' -- the outcome of ``determine_majority_label`` over the
        models' normalized votes;
      * 'AVG-majority-score' -- the mean score of the models that voted for
        the majority label, or the string 'N/A' when there is no majority.

    Votes are tallied on ``normalize_label(...)`` of each prediction. The raw
    label strings differ between models, so comparing them directly against
    'POS' / 'NEG' / 'NUT' would leave most models without a vote.

    Args:
        sentence: Dict with at least a 'sentence' entry holding the text.
        models: Mapping of model name -> callable returning a list whose first
            element is a dict with 'label' and 'score'.

    Returns:
        None; ``sentence`` is mutated.
    """
    label_counts = {'POS': 0, 'NEG': 0, 'NUT': 0}
    score_sums = {'POS': 0, 'NEG': 0, 'NUT': 0}
    text = sentence['sentence']

    for model_name, model in models.items():
        # Only 'model6' is called with explicit truncation to 128 tokens
        # (presumably because that checkpoint has a shorter input limit -- TODO confirm).
        if model_name == 'model6':
            prediction = model(text, truncation=True, max_length=128)[0]
        else:
            prediction = model(text)[0]

        # Keep the raw prediction for inspection/export.
        sentence[f'{model_name} score'] = prediction['score']
        sentence[f'{model_name} label'] = prediction['label']

        # Tally the vote on the shared POS/NEG/NUT vocabulary.
        vote = normalize_label(prediction['label'])
        label_counts[vote] += 1
        score_sums[vote] += prediction['score']

    majority = determine_majority_label(label_counts)
    sentence['majority'] = majority
    sentence['AVG-majority-score'] = (
        score_sums[majority] / label_counts[majority] if majority != 'N/A' else 'N/A'
    )

# Process the CSV data
def process_csv(csv_data, models):
    """Run ``process_sentence`` on every record, showing a green progress bar.

    Args:
        csv_data: Iterable of sentence records; each is handed to
            ``process_sentence`` together with ``models``.
        models: Passed through unchanged to ``process_sentence``.
    """
    progress = tqdm(csv_data, colour='green')
    for record in progress:
        # UserWarnings raised while scoring a record are silenced per record.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            process_sentence(record, models)

# Score every collected sentence with all models; the records in csv1 are
# updated in place. NOTE: long-running -- the saved progress bar below estimated
# roughly an hour for 8368 sentences.
process_csv(csv1, models)

  0%|[32m          [0m| 6/8368 [00:02<59:56,  2.33it/s]  

In [None]:
# Snapshot the scored records as a DataFrame and write them to disk without
# the index column. df_csv1 is what the per-article aggregation reads later.
df_csv1 = pd.DataFrame(data=csv1)
df_csv1.to_csv(path_or_buf='csv1.csv', index=False)

In [None]:
from tqdm import tqdm
import warnings

# Label normalization
# Raw label strings that are treated as positive / negative (presumably the
# label vocabularies of the checkpoints in use -- extend these lists if a model
# reports something else).
positive_labels = ['POSITIVE', 'positive', '4 stars', '5 stars', 'POS']
negative_labels = ['NEGATIVE', 'negative', '1 star', '2 stars', 'NEG']

def normalize_label(label):
    """Map a raw model label onto 'POS', 'NEG' or 'NUT'.

    Matching is case-insensitive and ignores surrounding whitespace, so
    spellings such as 'Positive' are accepted in addition to the exact entries
    of ``positive_labels`` / ``negative_labels``.

    Args:
        label: Raw label as reported by a model.

    Returns:
        'POS' for a positive label, 'NEG' for a negative label, and 'NUT' for
        anything else (e.g. '3 stars' or 'neutral').
    """
    wanted = str(label).strip().casefold()
    for bucket, known_labels in (('POS', positive_labels), ('NEG', negative_labels)):
        if any(wanted == known.casefold() for known in known_labels):
            return bucket
    return 'NUT'

# Chunk splitting
def split_into_chunks(sentence, chunk_size=128):
    """Cut ``sentence`` into consecutive chunks of at most ``chunk_size`` words.

    The text is split on single spaces and every chunk is joined back with
    single spaces. The result always holds at least one chunk (``['']`` for an
    empty sentence).

    Args:
        sentence: Text to split.
        chunk_size: Maximum number of space-separated words per chunk.

    Returns:
        List of chunk strings in original order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (a non-positive size
            can never consume the word list).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    words = sentence.split(' ')
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks

# Determine majority label
def determine_majority_label(label_counts, min_votes=4):
    """Return the first label that reaches ``min_votes`` votes.

    Labels are checked in the order 'POS', 'NEG', 'NUT'. The default of 4 is
    an absolute majority of the seven configured models, so at most one label
    can qualify in that setup.

    Args:
        label_counts: Mapping with the keys 'POS', 'NEG' and 'NUT'.
        min_votes: Minimum count a label needs to be returned (default 4).

    Returns:
        'POS', 'NEG' or 'NUT' when that label has at least ``min_votes`` votes,
        otherwise 'N/A'.
    """
    for label in ('POS', 'NEG', 'NUT'):
        if label_counts[label] >= min_votes:
            return label
    return 'N/A'

# Process model results
def process_model_results(results):
    """Reduce a sequence of predictions to one label plus its mean score.

    Args:
        results: Iterable of mappings, each with a 'label' and a 'score' entry.

    Returns:
        Dict with 'label' (result of ``determine_majority_label`` applied to
        the normalized label tallies) and 'score' (average score of the
        predictions carrying that label; 1 when the label is 'N/A').
    """
    tally = {'POS': 0, 'NEG': 0, 'NUT': 0}
    totals = {'POS': 0, 'NEG': 0, 'NUT': 0}

    for item in results:
        normalized = normalize_label(item['label'])
        item_score = item['score']

        if normalized in tally:
            tally[normalized] = tally[normalized] + 1
            totals[normalized] = totals[normalized] + item_score

    decided = determine_majority_label(tally)
    averaged = 1 if decided == 'N/A' else totals[decided] / tally[decided]

    return dict(label=decided, score=averaged)

# Process a single sentence
def process_sentence(sentence, models):
    """Score one sentence record with every model and store the ensemble vote.

    The record is updated in place with:
      * '<model name> score' / '<model name> label' -- the raw top prediction
        of each model;
      * 'majority' -- the outcome of ``determine_majority_label`` over the
        models' normalized votes;
      * 'AVG-majority-score' -- the mean score of the models that voted for
        the majority label, or the string 'N/A' when there is no majority.

    Votes are tallied on ``normalize_label(...)`` of each prediction. The raw
    label strings differ between models, so comparing them directly against
    'POS' / 'NEG' / 'NUT' would leave most models without a vote.

    Args:
        sentence: Dict with at least a 'sentence' entry holding the text.
        models: Mapping of model name -> callable returning a list whose first
            element is a dict with 'label' and 'score'.

    Returns:
        None; ``sentence`` is mutated.
    """
    label_counts = {'POS': 0, 'NEG': 0, 'NUT': 0}
    score_sums = {'POS': 0, 'NEG': 0, 'NUT': 0}
    text = sentence['sentence']

    for model_name, model in models.items():
        # Only 'model6' is called with explicit truncation to 128 tokens
        # (presumably because that checkpoint has a shorter input limit -- TODO confirm).
        if model_name == 'model6':
            prediction = model(text, truncation=True, max_length=128)[0]
        else:
            prediction = model(text)[0]

        # Keep the raw prediction for inspection/export.
        sentence[f'{model_name} score'] = prediction['score']
        sentence[f'{model_name} label'] = prediction['label']

        # Tally the vote on the shared POS/NEG/NUT vocabulary.
        vote = normalize_label(prediction['label'])
        label_counts[vote] += 1
        score_sums[vote] += prediction['score']

    majority = determine_majority_label(label_counts)
    sentence['majority'] = majority
    sentence['AVG-majority-score'] = (
        score_sums[majority] / label_counts[majority] if majority != 'N/A' else 'N/A'
    )

# Process the CSV data
def process_csv(csv_data, models):
    """Apply ``process_sentence`` to each record behind a green tqdm bar.

    Args:
        csv_data: Iterable of sentence records to score.
        models: Forwarded as-is to ``process_sentence``.
    """
    records = tqdm(csv_data, colour='green')
    for entry in records:
        # Suppress UserWarnings for the duration of each single scoring call.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            process_sentence(entry, models)

# Second pass over the same csv1 records using the definitions from this cell
# (majority now requires at least 4 votes). The records are mutated in place, so
# the per-model columns and the 'majority' / 'AVG-majority-score' fields written
# by the earlier pass are overwritten.
process_csv(csv1, models)

In [None]:
# Capture the re-scored records (same csv1 list, updated in place by the
# second pass) and export them without the index column.
df_csv2 = pd.DataFrame(data=csv1)
df_csv2.to_csv(path_or_buf='csv2.csv', index=False)

In [None]:
def get_highest_label(label_counts):
    """Return the key with the single highest value, or 'N/A' if undecided.

    Args:
        label_counts: Mapping of label -> count.

    Returns:
        The label holding the unique maximum count. 'N/A' is returned when the
        maximum is shared by several labels, and also when ``label_counts`` is
        empty (there is nothing to pick from).
    """
    # An empty mapping has no maximum; treat it as undecided.
    if not label_counts:
        return 'N/A'

    max_value = max(label_counts.values())

    # Every label that reaches the maximum; more than one means a tie.
    highest_labels = [key for key, value in label_counts.items() if value == max_value]

    return highest_labels[0] if len(highest_labels) == 1 else 'N/A'

In [None]:
# Roll the sentence-level verdicts in df_csv1 up to one verdict per article.
# papers_df ends up as a list of (articles DataFrame, paper name) tuples.
#
# Each sentence votes for a stance bucket:
#   'POS-I' -- positive about an 'I' sentence or negative about a 'P' sentence
#   'POS-P' -- positive about a 'P' sentence or negative about an 'I' sentence
#   'NUT'   -- sentence majority is 'NUT'
# Sentences whose majority is 'N/A' cast no vote.
stance_by_vote = {
    ('POS', 'I'): 'POS-I',
    ('NEG', 'P'): 'POS-I',
    ('POS', 'P'): 'POS-P',
    ('NEG', 'I'): 'POS-P',
}

papers_df = []
for paper in df_csv1['paper'].unique():
    paper_df = df_csv1[df_csv1['paper'] == paper]

    articles = []

    for article_index in paper_df['article_index'].unique():

        # Select within this paper only: 'article_index' is the row label of
        # the article inside its own sheet, so the same value can appear under
        # several papers and filtering the full frame would mix their sentences.
        article_df = paper_df[paper_df['article_index'] == article_index]
        label_counts = {'POS-I': 0, 'POS-P': 0, 'NUT': 0}
        score_sums = {'POS-I': 0, 'POS-P': 0, 'NUT': 0}

        for majority, side, avg_score in zip(
            article_df['majority'], article_df['I/P'], article_df['AVG-majority-score']
        ):
            bucket = 'NUT' if majority == 'NUT' else stance_by_vote.get((majority, side))
            if bucket is None:
                continue  # undecided sentence ('N/A'): no vote
            label_counts[bucket] += 1
            score_sums[bucket] += avg_score

        majority_label = get_highest_label(label_counts)
        articles.append({
            'article_index': article_index,
            'majority': majority_label,
            # Mean sentence score of the winning bucket; 1 is the placeholder
            # used when the article is undecided.
            'score': score_sums[majority_label] / label_counts[majority_label] if majority_label != 'N/A' else 1,
        })

    papers_df.append((pd.DataFrame(articles), paper))

In [None]:
# Write one worksheet per paper with its article-level verdicts.
# The loop variable is deliberately not named `df`: that name holds the dict of
# source sheets loaded at the top of the notebook, and rebinding it here would
# break any earlier cell that is re-run afterwards.
with pd.ExcelWriter('excel2.xlsx') as writer:
    for articles_df, sheet_name in papers_df:
        articles_df.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Print the overall stance of each paper: the article-level label that occurs
# most often, plus the mean score of the articles carrying that label.
# The loop variable is not named `df` so the dict of source sheets loaded at
# the top of the notebook is not overwritten.
for articles_df, sheet_name in papers_df:
    pro_classification = articles_df['majority'].value_counts().to_dict()
    # Undecided articles must not win the vote, so their count is forced to 0.
    pro_classification['N/A'] = 0

    # 'N/A' here means the leading labels are tied or no article was decided.
    decided_pro = get_highest_label(pro_classification)
    if decided_pro != 'N/A':
        score = articles_df.loc[articles_df['majority'] == decided_pro, 'score'].mean()
    else:
        score = 'N/A'
    print(f'paper: {sheet_name} label: {decided_pro} score: {score}')