In [None]:
import json

def process_Irish_Times(path_data='archive/new_IrishTimes_test.txt', path_labels='archive/new_IrishTimes_test_label.txt'):
    all_labels, all_data, irish_data = [], [], []
    with open(path_labels, 'r') as file:
        for line in file.readlines():
            all_labels.append(line.strip())
    with open (path_data, 'r') as file:
        for line in file.readlines():
            all_data.append(line.strip())
        for index, label in enumerate(all_labels):
            if label == 'sport':
                irish_data.append(all_data[index])
    return irish_data

def process_GOAL(path='goal/data/goal.json'):
    goal_data = []
    with open(path, 'r') as file:
        data = json.load(file)
    for sublist in data:
        subsublist = sublist['data']['commentary']
        for subsubsublist in subsublist:
            text = subsubsublist[1]
            best_sentence = sorted(text.split('.'), key=len, reverse=True)[0]
            goal_data.append(best_sentence)
    return goal_data

8102


In [67]:
import os

def process_Cornell(path):
    iterations = 0
    movie_data = []
    for filename in os.listdir(path):
        iterations += 1
        if filename.endswith('.txt'):
            filepath = os.path.join(path, filename)
            with open(filepath, 'r') as file:
                file_data = []
                for line in file.readlines():
                    if len(line) < 25:
                        continue
                    file_data.append(line.strip())
                movie_data.append(file_data)
        if iterations > 175:
            break

    movie_data = [item for sublist in movie_data for item in sublist]
    return movie_data

In [75]:
import pandas as pd
import random

def process_Amazon(path='Books_rating.csv'):
    book_data = []
    used_books = []
    first_1000 = pd.read_csv(path, nrows=160000)
    first_1000_list = first_1000.values.tolist()
    for sublist in first_1000_list:
        title = sublist[1]
        if title not in used_books:
            used_books.append(title)
            full_text = sublist[-1]
            full_text = full_text.split('.')
            random_idx = random.randint(a=0, b=len(full_text)-1)
            random_sentence = full_text[random_idx]
            #best_sentence = sorted(full_text, key=len, reverse=True)[0]
            book_data.append(random_sentence)
            #book_data.append(best_sentence)
    return book_data

In [None]:
def add_labels(dataset, label):
    labeled_dataset = []
    for instance in dataset:
        labeled = (instance, label)
        labeled_dataset.append(labeled)
    return labeled_dataset

def combine_datasets():
    irish_data = process_Irish_Times()
    goal_data = process_GOAL()
    pos_movie_data = process_Cornell('review_polarity/txt_sentoken/pos')
    neg_movie_data = process_Cornell('review_polarity/txt_sentoken/neg')
    sport_data = irish_data + goal_data
    movie_data = pos_movie_data + neg_movie_data
    book_data = process_Amazon()

    labeled_sports_data = add_labels(sport_data, 'sports')
    labeled_movie_data = add_labels(movie_data, 'movie')
    labeled_book_data = add_labels(book_data, 'book')

    final_dataset = labeled_sports_data + labeled_movie_data + labeled_book_data
    return final_dataset
    
full_dataset = combine_datasets()
print(full_dataset)

10722
10939
10893


In [77]:
import spacy
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

def preprocess(dataset):
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    stop_words = set(stopwords.words('english'))
    processed = []
    for (text, label) in dataset:
        tokens = simple_preprocess(text, deacc=True)
        tokens = [token for token in tokens if token not in stop_words]
        doc_out = nlp(' '.join(tokens))
        lemmatized = [token.lemma_ for token in doc_out if token.lemma_ not in stop_words and len(token.lemma_) > 2]
        processed.append((lemmatized, label))

    processed = [(' '.join(text) if isinstance(text, list) else text, label ) for text, label in processed]

    return processed

processed_dataset = preprocess(full_dataset)
print(processed_dataset)




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def model(dataset):
    texts = [x[0] for x in dataset]
    labels = [x[1] for x in dataset]

    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)


    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_vec, y_train)

    y_pred = clf.predict(X_test_vec)
    print(classification_report(y_test, y_pred))

    def predict(x):
        vec = vectorizer.transform([x])
        return clf.predict(vec)[0]

    with open("sentiment-topic-test.tsv", "r") as f:
        test_sentences = [line.strip() for line in f.readlines() if line.strip()]
        predicted_topics = [predict(s) for s in test_sentences]
    df = pd.read_csv("sentiment-topic-test.tsv", sep="\t")  # or sep="," if CSV
    print(df['topic'])
    print(predicted_topics)

model(processed_dataset)



              precision    recall  f1-score   support

        book       0.81      0.87      0.84      1092
       movie       0.85      0.83      0.84      1093
      sports       0.95      0.91      0.93      1071

    accuracy                           0.87      3256
   macro avg       0.87      0.87      0.87      3256
weighted avg       0.87      0.87      0.87      3256

0     sports
1     sports
2       book
3       book
4       book
5      movie
6      movie
7     sports
8     sports
9      movie
10      book
11      book
12     movie
13    sports
14    sports
15     movie
16     movie
17      book
Name: topic, dtype: object
['book', 'sports', 'sports', 'book', 'book', 'book', 'movie', 'movie', 'sports', 'book', 'movie', 'book', 'book', 'movie', 'movie', 'sports', 'movie', 'movie', 'book']
