
- Perform sentiment analysis on the scraped data to determine the general attitude and interest levels regarding clinical trials.


In [None]:
from transformers import pipeline, set_seed
import pandas as pd
import os
import matplotlib.pyplot as plt

from transformers import pipeline
# Seed the random number generators via transformers' set_seed helper so
# repeated runs of this notebook are reproducible.
set_seed(42)

# Build the sentiment-analysis pipeline once at module level; it is reused for
# every text scored by get_sentiment_score. No model name is passed, so the
# library picks its default model for this task.
# NOTE(review): max_length=512 is presumably forwarded to the tokenizer to cap
# input length -- confirm that over-long texts are actually truncated.
sentiment_pipeline = pipeline("sentiment-analysis", max_length=512)

In [None]:
def dataloader(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path)


def fillna(data):
    """Fill missing ``selftext`` values with the row's ``title``.

    The frame is modified in place and also returned, so callers can use
    either style.

    Args:
        data: DataFrame with ``selftext`` and ``title`` columns.

    Returns:
        The same DataFrame object, with ``selftext`` gaps filled.
    """
    # Assign the filled column back instead of calling
    # ``data['selftext'].fillna(..., inplace=True)``: that chained in-place
    # call operates on a temporary under pandas Copy-on-Write and would leave
    # ``data`` unchanged.
    data['selftext'] = data['selftext'].fillna(data['title'])
    return data

def get_sentiment_score(text):
    """Return a signed sentiment score for ``text``.

    The text is run through the module-level ``sentiment_pipeline`` and the
    first prediction is used: its score is returned unchanged when the label
    is ``'POSITIVE'`` and negated for any other label.

    Args:
        text: The text to score.

    Returns:
        The signed score, or ``0`` if scoring this input raised an error.
    """
    try:
        sentiment = sentiment_pipeline(text)[0]
        return sentiment['score'] if sentiment['label'] == 'POSITIVE' else -sentiment['score']
    except Exception:
        # Best-effort on purpose: one unscorable row should not abort a whole
        # run. ``Exception`` (rather than a bare ``except``) still lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return 0



In [None]:

def preprocess_data(file_path):
    """Load a CSV, score its text for sentiment, and save the result.

    Steps: load the file with ``dataloader``, fill missing ``selftext`` via
    ``fillna``, add a ``sentiment_score`` column from ``get_sentiment_score``
    and a ``sentiment_label`` column from ``get_sentiment_label``, then write
    the frame to ``preprocessed_<file_path>``.

    Args:
        file_path: Path of the input CSV.

    Returns:
        The DataFrame with the two sentiment columns added.
    """
    data = dataloader(file_path)
    data = fillna(data)
    data['sentiment_score'] = data['selftext'].apply(get_sentiment_score)
    # NOTE(review): get_sentiment_label is not defined in the visible part of
    # this file -- confirm it is defined before this function is called.
    data['sentiment_label'] = data['sentiment_score'].apply(get_sentiment_label)

    # The prefix is applied to the whole path, so 'raw_data/x.csv' is written
    # to 'preprocessed_raw_data/x.csv'. Create that directory if needed so the
    # write does not fail on a fresh checkout.
    output_path = f'preprocessed_{file_path}'
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path, index=False)
    return data


def plot_sentiment_score_distribution(data, file_path):
    """Save a histogram of ``data['sentiment_score']`` as a PNG.

    Args:
        data: DataFrame with a ``sentiment_score`` column.
        file_path: Base name of the dataset. The part before the first ``_``
            (capitalized) is used in the title, and the full value is used in
            the output file name under ``../data/processed/``.
    """
    # Draw on a fresh figure/axes for every call. Without an explicit ``ax``,
    # Series.plot reuses the current axes, so when this is called in a loop
    # each saved image would also contain the histograms of earlier files.
    fig, ax = plt.subplots()
    data['sentiment_score'].plot(kind='hist', bins=100, ax=ax)
    ax.set_xlabel('Sentiment Score')
    ax.set_title(f'Sentiment Score Distribution - {file_path.split("_")[0].capitalize()}')
    # The figure is intentionally left open so notebook front-ends can still
    # display it after the cell finishes.
    fig.savefig(f'../data/processed/{file_path}_sentiment_score_distribution.png', dpi=400)


# Process every CSV in raw_data/: score it, save the preprocessed copy, and
# plot its sentiment-score histogram (named after the file's stem).
for file in os.listdir('raw_data'):
    if not file.endswith('.csv'):
        continue
    data = preprocess_data(f'raw_data/{file}')
    plot_sentiment_score_distribution(data, file.split('.')[0])
