Models Processing

In [1]:
model_paths = {
        'model1': "cardiffnlp/twitter-roberta-base-sentiment-latest",
        'model2': "nlptown/bert-base-multilingual-uncased-sentiment",
        'model3': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
        'model4': "siebert/sentiment-roberta-large-english",
        'model5': "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
        'model6': "finiteautomata/bertweet-base-sentiment-analysis",
        'model7': "j-hartmann/sentiment-roberta-large-english-3-classes"
}

In [2]:
from transformers import pipeline

models = {name : pipeline('sentiment-analysis', model=model_path) for name, model_path in model_paths.items()}

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device se

Text Importing

In [3]:
import pandas as pd
df = pd.read_excel('posts_first_targil.xlsx', sheet_name=None)

In [4]:
df['J-P'].rename(columns={'Body': 'Body Text'}, inplace=True)

In [5]:
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}")
    print(sheet_df.columns)

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [6]:
with open('israel.txt', 'r') as israel_file, open('palestine.txt', 'r') as palestine_file:
    i_word = israel_file.read().splitlines()
    p_word = palestine_file.read().splitlines()


In [7]:
import numpy as np
import pandas as pd

csv1 = []

for sheet_name, sheet_df in df.items():
    for index, row in sheet_df.iterrows():

        article_sentences = []

        if pd.notna(row['Body Text']):
            article_sentences += row['Body Text'].split('.')

        if pd.notna(row['title']):
            article_sentences += row['title'].split('.')

        if sheet_name == 'A-J' and pd.notna(row['sub_title']):
            article_sentences += row['sub_title'].split('.')

        for sentence in article_sentences:
            if any(word in sentence for word in i_word) and not any(word in sentence for word in p_word):
                csv1.append({'paper': sheet_name, 'article_index': index, 'sentence': sentence, 'I/P': 'I'})
            elif any(word in sentence for word in p_word) and not any(word in sentence for word in i_word):
                csv1.append({'paper': sheet_name, 'article_index': index, 'sentence': sentence, 'I/P': 'P'})

In [19]:
from tqdm import tqdm

# Label normalization
positive_labels = ['POSITIVE', 'positive', '4 stars', '5 stars', 'POS']
negative_labels = ['NEGATIVE', 'negative', '1 star', '2 stars', 'NEG']

def normalize_label(label):
    if label in positive_labels:
        return 'POS'
    elif label in negative_labels:
        return 'NEG'
    else:
        return 'NUT'

# Chunk splitting
def split_into_chunks(sentence, chunk_size=128):
    words = sentence.split(' ')
    chunks = []

    while len(words) > chunk_size:
        chunks.append(' '.join(words[:chunk_size]))
        words = words[chunk_size:]

    chunks.append(' '.join(words))
    return chunks

# Determine majority label
def determine_majority_label(label_counts):
    return max(label_counts, key=label_counts.get, default='N/A')

# Calculate average score
def calculate_average_score(label, score_sums, label_counts):
    return score_sums[label] / label_counts[label] if label_counts[label] > 0 else 'N/A'

# Process model results
def process_model_results(results):
    label_counts = {'POS': 0, 'NEG': 0, 'NUT': 0}
    score_sums = {'POS': 0, 'NEG': 0, 'NUT': 0}

    for result in results:
        label = normalize_label(result['label'])
        score = result['score']

        if label in label_counts:
            label_counts[label] += 1
            score_sums[label] += score

    majority_label = determine_majority_label(label_counts)
    average_score = calculate_average_score(majority_label, score_sums, label_counts) if majority_label != 'N/A' else 1

    return {
        'label': majority_label,
        'score': average_score,
        'counts': label_counts,
        'scores': score_sums
    }

# Process a single sentence
def process_sentence(sentence, models):
    pos, neg, nut = 0, 0, 0
    sum_pos, sum_neg, sum_nut = 0, 0, 0

    split_sentence = split_into_chunks(sentence['sentence'])

    for model_name, model in models.items():
        model_results = model(split_sentence)
        processed_results = process_model_results(model_results)

        sentence[f'{model_name} score'] = processed_results['score']
        sentence[f'{model_name} label'] = processed_results['label']

        # Aggregate counts and scores
        pos += processed_results['counts']['POS']
        neg += processed_results['counts']['NEG']
        nut += processed_results['counts']['NUT']

        sum_pos += processed_results['scores']['POS']
        sum_neg += processed_results['scores']['NEG']
        sum_nut += processed_results['scores']['NUT']

    label_counts = {'POS': pos, 'NEG': neg, 'NUT': nut}
    score_sums = {'POS': sum_pos, 'NEG': sum_neg, 'NUT': sum_nut}

    sentence['majority'] = determine_majority_label(label_counts)
    sentence['AVG-majority-score'] = calculate_average_score(
        sentence['majority'], score_sums, label_counts
    )

    return sentence

# Process the CSV data
def process_csv(csv_data, models):
    for sentence in tqdm(csv_data, colour='green'):
        process_sentence(sentence, models)

process_csv(csv1, models)

 22%|[32m██▏       [0m| 1300/6043 [23:22<1:25:17,  1.08s/it]


IndexError: index out of range in self

In [None]:
df_csv1 = pd.DataFrame(csv1)
df_csv1.to_csv('csv1.csv', index=False)