In [None]:
#Loading and Exploring Data


import pandas as pd

# Locations of the raw, tab-separated review files
train_data_path = 'drugLibTrain_raw.tsv'
test_data_path = 'drugLibTest_raw.tsv'

# Parse both files into DataFrames (train first, then test)
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_data_path, test_data_path)
)

# Peek at the first rows of the training set to see which columns exist
train_data.head()

In [None]:
#Preprocessing the Data


# Handle missing values
# Only the non-numeric columns are filled with '': a blanket fillna('') would
# also write empty strings into any numeric column containing NaN and
# silently turn it into an object-dtype column.
for frame in (train_data, test_data):
    non_numeric_columns = frame.select_dtypes(exclude='number').columns
    frame[non_numeric_columns] = frame[non_numeric_columns].fillna('')

# Additional preprocessing steps
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
# Some NLTK releases load the Open Multilingual WordNet alongside WordNet and
# raise LookupError from WordNetLemmatizer.lemmatize() when it is missing;
# fetching it is harmless on releases that do not need it.
nltk.download('omw-1.4')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Lazily-filled cache of stop-word sets, keyed by language name.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop words for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        # stopwords.words() re-reads the corpus and returns a list; read it
        # once and keep it as a set so membership tests are O(1).
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Clean one review string for bag-of-words style feature extraction.

    Steps: replace every non-ASCII-letter character with a space, split on
    whitespace, drop English stop words, lemmatize each remaining token with
    the module-level `lemmatizer`, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Stop-word matching is case-sensitive and the NLTK list is
        lowercase, so callers should lowercase the text first.

    Returns
    -------
    str
        The cleaned, space-separated tokens ('' when nothing survives).
    """
    stop_words = _get_stop_words('english')

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Tokenize and remove stop words
    tokens = [word for word in text.split() if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Lowercase and clean every free-text review column in both data sets
review_columns = ('benefitsReview', 'sideEffectsReview', 'commentsReview')
for frame in (train_data, test_data):
    for column in review_columns:
        lowered = frame[column].str.lower()
        frame[column] = lowered.apply(preprocess_text)

train_data.head()

In [None]:
#Text Data Analysis and Feature Extraction
#Combined Approach: Useful when the context of the text across different columns is related and can be considered as a whole.
    

# 1. Import Necessary Libraries for Analysis
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming train_data and test_data are already loaded and preprocessed

# 2. Concatenate Text Columns (If needed)
# Each row's three review fields are joined, space-separated, into one
# 'combined_text' column on both the train and the test frame.
text_columns = ['benefitsReview', 'sideEffectsReview', 'commentsReview']
for frame in (train_data, test_data):
    frame['combined_text'] = frame[text_columns].agg(' '.join, axis=1)

# 3. Exploratory Analysis: Word Frequency
# Count every whitespace-separated token across the whole training corpus
all_text = ' '.join(train_data['combined_text'])
word_counts = Counter()
word_counts.update(all_text.split())
most_common_words = word_counts.most_common(30)
print("Most Common Words:", most_common_words)

# Bar chart of the 30 most frequent tokens
words, counts = zip(*most_common_words)
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.title('Top 30 Most Common Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

# 4. Apply TF-IDF Vectorization
# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto them.
train_corpus = train_data['combined_text']
test_corpus = test_data['combined_text']

tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can tune these parameters
train_tfidf = tfidf_vectorizer.fit_transform(train_corpus)
test_tfidf = tfidf_vectorizer.transform(test_corpus)

# train_tfidf and test_tfidf can now be fed to machine learning models.


In [None]:
#Text Data Analysis and Feature Extraction
#Separate Approach: Better when each text column has distinct contextual information that you want to capture individually.


# 1. Import Necessary Libraries
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# Assuming train_data and test_data are already loaded and preprocessed

# 2. Exploratory Analysis: Word Frequency
# The three review columns are joined per row, then across rows, purely for
# this exploratory count (no column is added to the frame here).
review_text = train_data[['benefitsReview', 'sideEffectsReview', 'commentsReview']]
per_row_text = review_text.agg(' '.join, axis=1)
all_text = ' '.join(per_row_text)
word_counts = Counter()
word_counts.update(all_text.split())
most_common_words = word_counts.most_common(30)
print("Most Common Words:", most_common_words)

# Bar chart of the 30 most frequent tokens
words, counts = zip(*most_common_words)
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.title('Top 30 Most Common Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

# 3. One-Hot Encoding for Categorical Columns
# Categories are learned from the training frame; categories that appear
# only in the test frame are ignored (handle_unknown='ignore').
categorical_columns = ['effectiveness', 'sideEffects', 'condition']
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_features_train = encoder.fit_transform(train_data[categorical_columns])
encoded_features_test = encoder.transform(test_data[categorical_columns])

# 4. Separate TF-IDF Vectorization for Text Columns
# Each text column gets its OWN vectorizer. Re-fitting a single shared
# instance would discard the vocabulary/IDF learned for the earlier columns,
# making it impossible to transform new data or inspect features later.
benefits_vectorizer = TfidfVectorizer(max_features=300)  # Adjusted for individual columns
benefits_tfidf_train = benefits_vectorizer.fit_transform(train_data['benefitsReview'])
benefits_tfidf_test = benefits_vectorizer.transform(test_data['benefitsReview'])

sideEffects_vectorizer = TfidfVectorizer(max_features=300)
sideEffects_tfidf_train = sideEffects_vectorizer.fit_transform(train_data['sideEffectsReview'])
sideEffects_tfidf_test = sideEffects_vectorizer.transform(test_data['sideEffectsReview'])

comments_vectorizer = TfidfVectorizer(max_features=300)
comments_tfidf_train = comments_vectorizer.fit_transform(train_data['commentsReview'])
comments_tfidf_test = comments_vectorizer.transform(test_data['commentsReview'])

# Backward-compatible alias: this cell previously left `tfidf_vectorizer`
# bound to the vectorizer fitted on 'commentsReview'.
tfidf_vectorizer = comments_vectorizer

# 5. Combining All Features
# Column order: benefits TF-IDF | side-effects TF-IDF | comments TF-IDF | one-hot.
# format='csr' because hstack otherwise returns a COO matrix, which does not
# support the row indexing/slicing needed for splits and cross-validation.
train_features = hstack(
    [benefits_tfidf_train, sideEffects_tfidf_train, comments_tfidf_train, encoded_features_train],
    format='csr',
)
test_features = hstack(
    [benefits_tfidf_test, sideEffects_tfidf_test, comments_tfidf_test, encoded_features_test],
    format='csr',
)

# Now, train_features and test_features are ready for use in machine learning models.


In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained tokenizer and encoder from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model_bert = BertModel.from_pretrained(bert_checkpoint)

def get_bert_embeddings(sentences, batch_size=32):
    """Embed sentences with BERT using attention-mask weighted mean pooling.

    Parameters
    ----------
    sentences : str or iterable of str
        Text(s) to embed. A single string is treated as one sentence.
    batch_size : int, default 32
        Number of sentences sent through the model per forward pass. Batching
        keeps memory bounded instead of encoding the whole corpus at once.

    Returns
    -------
    torch.Tensor
        Tensor of shape (number of sentences, hidden size), located on the
        model's device. Empty input yields a tensor with zero rows.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    if not sentences:
        # The tokenizer cannot encode an empty batch; return an empty result.
        return torch.empty((0, model_bert.config.hidden_size), device=model_bert.device)

    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Tokenize and encode sentences for BERT
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Move to the same device as the model
        encoded_input = encoded_input.to(model_bert.device)

        # Get embeddings
        with torch.no_grad():
            output = model_bert(**encoded_input)
        token_embeddings = output['last_hidden_state']

        # Mean pool over REAL tokens only: padded positions are zeroed out via
        # the attention mask and excluded from the divisor, so the result does
        # not depend on how much padding a sentence received in its batch.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled_batches.append(summed / token_counts)

    return torch.cat(pooled_batches, dim=0)

# Example usage: embed the combined review text of every training row
combined_sentences = train_data['combined_text'].tolist()
bert_embeddings = get_bert_embeddings(combined_sentences)
