In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# General functions to run

Active learning Pipeline

# Datasets Evaluation

## IMDB Dataset

The dataset can be found here: https://www.kaggle.com/datasets/vishakhdapat/imdb-movie-reviews?resource=download

### First Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stopword corpus; text_analysis() below reads the English
# stopword list from it.
nltk.download('stopwords')

# Load the IMDB reviews CSV (path is relative to the notebook's working
# directory). The helpers below read its 'review' and 'sentiment' columns.
imdb_data = pd.read_csv('../data/IMDB Dataset.csv')

# Sentiment Distribution
def sentiment_distribution(data):
    """Show a bar chart of how many rows carry each sentiment label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'sentiment' column; one bar is drawn per distinct value.
    """
    label_counts = data['sentiment'].value_counts()

    _, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(
        x=label_counts.index,
        y=label_counts.values,
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Sentiment Distribution')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    plt.show()

# Text Analysis
def text_analysis(data):
    """Plot length, word-frequency and word-cloud summaries of the reviews.

    Shows three figures in order: a histogram of review lengths, a bar chart
    of the 20 most frequent non-stopword tokens, and side-by-side word clouds
    for positive and negative reviews.

    Side effect: adds (or overwrites) a 'review_length' column on ``data``
    holding the character count of each review.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'review' text column and a 'sentiment' column whose
        values are the strings 'positive' / 'negative'.
    """
    # Stored on the frame (not a local) because additional_insights() reads
    # this column. `.str.len()` yields NaN for a missing review instead of
    # raising TypeError the way `apply(len)` does.
    data['review_length'] = data['review'].str.len()
    _plot_review_lengths(data['review_length'])

    stop_words = set(stopwords.words('english'))
    _plot_common_words(data['review'], stop_words)
    _plot_sentiment_wordclouds(data, stop_words)


def _plot_review_lengths(review_lengths):
    """Histogram (with KDE overlay) of per-review character counts."""
    plt.figure(figsize=(8, 6))
    sns.histplot(review_lengths, kde=True, color='blue')
    plt.title('Review Length Distribution')
    plt.xlabel('Review Length')
    plt.ylabel('Frequency')
    plt.show()


def _plot_common_words(reviews, stop_words, top_n=20):
    """Horizontal bar chart of the ``top_n`` most frequent tokens.

    A token is a whitespace-separated, lower-cased chunk that is purely
    alphabetic and not in ``stop_words``; tokens with attached punctuation
    (e.g. "movie,") are therefore not counted.
    """
    word_freq = Counter()
    # Count review by review rather than joining the whole corpus into one
    # giant string first; the resulting counts are the same.
    for review in reviews.dropna():
        word_freq.update(
            token
            for token in review.lower().split()
            if token.isalpha() and token not in stop_words
        )
    common_words = word_freq.most_common(top_n)

    plt.figure(figsize=(10, 8))
    sns.barplot(
        x=[count for _, count in common_words],
        y=[token for token, _ in common_words],
        palette='inferno',
    )
    plt.title('Most Common Words')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.show()


def _plot_sentiment_wordclouds(data, stop_words):
    """Side-by-side word clouds of the positive and the negative reviews."""
    panels = (
        ('positive', 'Blues', 'Word Cloud for Positive Reviews'),
        ('negative', 'Reds', 'Word Cloud for Negative Reviews'),
    )

    plt.figure(figsize=(12, 6))
    for position, (label, colormap, title) in enumerate(panels, start=1):
        # Concatenate every review of this sentiment into one document.
        text = data[data['sentiment'] == label]['review'].str.cat(sep=' ')
        cloud = WordCloud(
            stopwords=stop_words, background_color='white', colormap=colormap
        ).generate(text)

        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(title)
        plt.axis('off')

    plt.show()

# Additional Insights
def additional_insights(data):
    """Show a bar chart of the mean review length for each sentiment label.

    Uses the 'review_length' column when ``data`` already has one; otherwise
    the lengths are computed from the 'review' column, so the function also
    works on a freshly loaded frame. ``data`` is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentiment' column and either a 'review_length'
        column or a 'review' text column.
    """
    if 'review_length' in data.columns:
        review_length = data['review_length']
    else:
        # Fallback so the plot does not depend on another cell having
        # added the column first.
        review_length = data['review'].str.len()

    # Average review length by sentiment
    avg_length_by_sentiment = review_length.groupby(data['sentiment']).mean()

    plt.figure(figsize=(8, 6))
    sns.barplot(x=avg_length_by_sentiment.index, y=avg_length_by_sentiment.values, palette='coolwarm')
    plt.title('Average Review Length by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Average Review Length')
    plt.show()

In [None]:
# Plot how many reviews carry each sentiment label.
sentiment_distribution(imdb_data)

In [None]:
# Review-length histogram, most common words and per-sentiment word clouds.
# Also adds a 'review_length' column to imdb_data.
text_analysis(imdb_data)

In [None]:
# Plot the average review length for each sentiment label.
additional_insights(imdb_data)

### Active learning pipeline

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

# Load the CSV file
# NOTE: this rebinds `imdb_data` with integer labels. text_analysis() above
# compares 'sentiment' against the string 'positive'/'negative', so reload
# the CSV before re-running the exploratory cells after this one.
imdb_data = pd.read_csv('../data/IMDB Dataset.csv')

# Encode the target labels: 'positive' -> 1, anything else -> 0
imdb_data['sentiment'] = imdb_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Seeded generator so the rows drawn from the pool are the same on every run.
sampling_rng = random.Random(42)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Hold out a fixed test set FIRST. Evaluating on the full dataset would score
# the model on the very rows it was trained on and inflate the accuracy.
X_trainable, X_test, y_trainable, y_test = train_test_split(
    imdb_data['review'],
    imdb_data['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=imdb_data['sentiment'],
)

# Split the remaining data into a pool of "unlabeled" data and a small initial training set
X_pool, X_initial, y_pool, y_initial = train_test_split(X_trainable, y_trainable, test_size=0.05, random_state=42)

# Fit the vocabulary on the initial training set only, then vectorize it
X_initial_vectorized = vectorizer.fit_transform(X_initial).toarray()

# The vectorizer is never refit, so the test matrix is loop-invariant:
# transform it once and keep it sparse to avoid a large dense array.
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the model
model = LogisticRegression()

# Train the model on the initial training set
model.fit(X_initial_vectorized, y_initial)

# Define the number of iterations and the sample size for each iteration
iterations = 10
sample_size = 100

# Active Learning Loop
# The query step draws rows uniformly at random from the pool; it does not
# use the model's predictions to choose which rows to label.
for i in range(iterations):
    # Randomly sample a subset from the pool (never more than what is left)
    n_draw = min(sample_size, len(X_pool))
    sample_indices = sampling_rng.sample(range(len(X_pool)), n_draw)
    X_sampled = X_pool.iloc[sample_indices]
    y_sampled = y_pool.iloc[sample_indices]

    # Vectorize the sampled data
    X_sampled_vectorized = vectorizer.transform(X_sampled).toarray()

    # Update the training data
    X_initial_vectorized = np.vstack((X_initial_vectorized, X_sampled_vectorized))
    y_initial = np.hstack((y_initial, y_sampled))

    # Shuffle the updated training data (features and labels stay aligned)
    X_initial_vectorized, y_initial = shuffle(X_initial_vectorized, y_initial, random_state=42)

    # Retrain the model on the updated training set
    model.fit(X_initial_vectorized, y_initial)

    # Evaluate the model on the held-out test set
    y_pred = model.predict(X_test_vectorized)

    # Output the current iteration's performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Iteration {i+1}/{iterations} - Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Remove the sampled instances from the pool
    X_pool = X_pool.drop(X_sampled.index)
    y_pool = y_pool.drop(y_sampled.index)

    # Check if the pool is empty
    if len(X_pool) == 0:
        print("No more data to sample from.")
        break

## Skin Cancer ISIC Dataset

The dataset can be found here: https://www.kaggle.com/datasets/nodoubttome/skin-cancer9-classesisic