In [1]:
pip install numpy scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Sample training data
papers = [
    "COVID vaccine clinical trials",           # Medical
    "vaccine efficacy study results",          # Medical
    "psychological impact of lockdown",        # Psychology
    "anxiety depression treatment study",      # Psychology
    "vaccine side effects analysis",           # Medical
    "mental health during pandemic",           # Psychology
    "antibody response in patients",           # Medical
    "cognitive behavioral therapy research"     # Psychology
]

# Labels: 1 for Medical, 0 for Psychology
labels = np.array([1, 1, 0, 0, 1, 0, 1, 0])

# Convert text to numbers using simple word counting
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(papers)

# Train the model
model = LogisticRegression()
model.fit(X, labels)

# Function to predict journal and show reasoning
def predict_paper(paper_text):
    # Convert paper text to numbers
    paper_features = vectorizer.transform([paper_text])
    
    # Get prediction and probability
    prediction = model.predict(paper_features)[0]
    probabilities = model.predict_proba(paper_features)[0]
    
    # Get important words and their weights
    feature_names = vectorizer.get_feature_names_out()
    weights = model.coef_[0]
    word_weights = dict(zip(feature_names, weights))
    
    # Print results
    print(f"\nPaper text: '{paper_text}'")
    print(f"Predicted journal: {'Medical' if prediction == 1 else 'Psychology'}")
    print(f"Confidence: Medical={probabilities[1]:.2%}, Psychology={probabilities[0]:.2%}")
    
    # Show top words that influenced the decision
    print("\nTop words influencing the decision:")
    for word in paper_text.split():
        if word in word_weights:
            print(f"- '{word}': weight={word_weights[word]:.2f}")

# Test some examples
test_papers = [
    "new vaccine development study",
    "depression and anxiety research",
    "COVID treatment clinical trial"
]

for paper in test_papers:
    predict_paper(paper)


Paper text: 'new vaccine development study'
Predicted journal: Medical
Confidence: Medical=62.91%, Psychology=37.09%

Top words influencing the decision:
- 'vaccine': weight=0.71
- 'study': weight=-0.01

Paper text: 'depression and anxiety research'
Predicted journal: Psychology
Confidence: Medical=27.75%, Psychology=72.25%

Top words influencing the decision:
- 'depression': weight=-0.27
- 'anxiety': weight=-0.27
- 'research': weight=-0.24

Paper text: 'COVID treatment clinical trial'
Predicted journal: Medical
Confidence: Medical=50.25%, Psychology=49.75%

Top words influencing the decision:
- 'treatment': weight=-0.27
- 'clinical': weight=0.23


In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

class SimpleJournalClassifier:
    def __init__(self, learning_rate=0.1):
        self.learning_rate = learning_rate
        self.vectorizer = CountVectorizer()
        self.weights = None
        self.bias = 0
        
    def train_one_example(self, paper, true_label, print_details=True):
        # Convert paper text to word counts
        features = self.vectorizer.transform([paper]).toarray()[0]
        print(f"\nFeatures for paper '{paper}':" + "\n" + str(features))
        
        # Make prediction with current weights
        prediction = np.dot(features, self.weights) + self.bias
        predicted_label = 1 if prediction > 0 else 0
        
        if print_details:
            print(f"\nPaper: '{paper}'")
            print(f"True label: {'Medical' if true_label == 1 else 'Psychology'}")
            print(f"Predicted: {'Medical' if predicted_label == 1 else 'Psychology'}")
            
            # Show current weights for words in this paper
            print("\nCurrent word weights:")
            words = paper.split()
            for word in words:
                if word in self.vectorizer.vocabulary_:
                    idx = self.vectorizer.vocabulary_[word]
                    print(f"- '{word}': {self.weights[idx]:.2f}")
        
        # If prediction is wrong, adjust weights
        if predicted_label != true_label:
            error = true_label - predicted_label
            
            # Update weights for each word
            self.weights += self.learning_rate * error * features
            self.bias += self.learning_rate * error
            
            if print_details:
                print("\nPrediction was wrong! Adjusting weights...")
                print("New word weights:")
                for word in words:
                    if word in self.vectorizer.vocabulary_:
                        idx = self.vectorizer.vocabulary_[word]
                        print(f"- '{word}': {self.weights[idx]:.2f}")
        
        return predicted_label == true_label

    def train(self, papers, labels, epochs=3):
        # First, build vocabulary and convert to number format
        self.vectorizer.fit(papers)
        
        # Initialize weights to small random values
        vocab_size = len(self.vectorizer.vocabulary_)
        self.weights = np.random.randn(vocab_size) * 0.01
        
        print("Initial random weights:")
        print(dict(zip(self.vectorizer.get_feature_names_out(), self.weights.round(2))))
        
        # Training loop
        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}:")
            correct = 0
            for paper, label in zip(papers, labels):
                if self.train_one_example(paper, label):
                    correct += 1
            print(f"\nAccuracy this epoch: {correct/len(papers):.0%}")

# Training data
papers = [
    "vaccine clinical trials",           # Medical
    "depression therapy study",          # Psychology
    "vaccine patient response",          # Medical
    "anxiety treatment research"         # Psychology
]

labels = [1, 0, 1, 0]  # 1 for Medical, 0 for Psychology

# Create and train classifier
classifier = SimpleJournalClassifier(learning_rate=0.1)
classifier.train(papers, labels)

# Test the final model
print("\nTesting final model:")
test_papers = [
    "new vaccine study",
    "depression analysis"
]

for paper in test_papers:
    features = classifier.vectorizer.transform([paper]).toarray()[0]
    prediction = np.dot(features, classifier.weights) + classifier.bias
    print(f"\nPaper: '{paper}'")
    print(f"Prediction: {'Medical' if prediction > 0 else 'Psychology'}")
    
    # Show weights of words in test paper
    print("Word weights that influenced this decision:")
    for word in paper.split():
        if word in classifier.vectorizer.vocabulary_:
            idx = classifier.vectorizer.vocabulary_[word]
            print(f"- '{word}': {classifier.weights[idx]:.2f}")

Initial random weights:
{'anxiety': 0.01, 'clinical': 0.02, 'depression': -0.01, 'patient': -0.01, 'research': 0.0, 'response': 0.01, 'study': -0.01, 'therapy': 0.02, 'treatment': 0.01, 'trials': -0.01, 'vaccine': -0.0}

Epoch 1:

Features for paper 'vaccine clinical trials':
[0 1 0 0 0 0 0 0 0 1 1]

Paper: 'vaccine clinical trials'
True label: Medical
Predicted: Psychology

Current word weights:
- 'vaccine': -0.00
- 'clinical': 0.02
- 'trials': -0.01

Prediction was wrong! Adjusting weights...
New word weights:
- 'vaccine': 0.10
- 'clinical': 0.12
- 'trials': 0.09

Features for paper 'depression therapy study':
[0 0 1 0 0 0 1 1 0 0 0]

Paper: 'depression therapy study'
True label: Psychology
Predicted: Medical

Current word weights:
- 'depression': -0.01
- 'therapy': 0.02
- 'study': -0.01

Prediction was wrong! Adjusting weights...
New word weights:
- 'depression': -0.11
- 'therapy': -0.08
- 'study': -0.11

Features for paper 'vaccine patient response':
[0 0 0 1 0 1 0 0 0 0 1]

Paper: