In [1]:
import random
import numpy as np
import pandas as pd
random.seed(42)
np.random.seed(42)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import classification_report

# 1. Only use text data

In [2]:
data_path = 'dataset/convincing_data.csv'


def _tokens_to_text(raw):
    """Flatten one 'body_cleaned' cell into a space-separated string.

    The cell is treated as a bracketed, quoted, comma-separated list
    (presumably a stringified Python token list - confirm against the CSV):
    the surrounding brackets are stripped, single quotes are removed, and
    the items are re-joined with single spaces.
    """
    tokens = raw.strip('[]').replace("'", '').split(', ')
    return ' '.join(tokens)


data = pd.read_csv(data_path)
data['text'] = data['body_cleaned'].apply(_tokens_to_text)
# .copy() makes the three-column subset an independent frame, so assigning to
# its columns later does not raise SettingWithCopyWarning.
data = data[['id', 'text', 'sentiment']].copy()

# 2. Sentiment threshold is ±1/3

In [3]:
def score_label(score, threshold=1/3):
    """Map a continuous sentiment score to a discrete class label.

    Args:
        score: Numeric sentiment score.
        threshold: Positive cut-off applied symmetrically around zero.
            Defaults to 1/3, the value previously hard-coded here.

    Returns:
        1 if ``score > threshold``, -1 if ``score < -threshold``, otherwise 0.
        Both comparisons are strict, so scores exactly on either boundary
        (and NaN, for which both comparisons are False) are labelled 0.
    """
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0
# Replace each raw sentiment score with its discrete label from score_label.
data['sentiment'] = data['sentiment'].apply(score_label)

# 3. Four different train/test feature sets

## 3.1 Only CountVectorizer

In [4]:
def split1(df):
    """Split ``df`` 90/10 and vectorize the text with CountVectorizer.

    Args:
        df: DataFrame providing a 'text' column (features) and a
            'sentiment' column (labels).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X entries are
        the CountVectorizer outputs for the train and test texts. The
        vectorizer is fitted on the training texts only.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators with 42
        and prints the shapes of both feature matrices.
    """
    np.random.seed(42)
    random.seed(42)

    texts, labels = df['text'], df['sentiment']
    train_text, test_text, y_train, y_test = train_test_split(
        texts, labels, test_size=0.1, random_state=42
    )

    # Learn the vocabulary from the training texts, then reuse it for the test texts.
    bow = CountVectorizer()
    train_counts = bow.fit_transform(train_text)
    test_counts = bow.transform(test_text)

    for tag, matrix in (("X_train.shape: ", train_counts),
                        ("X_test.shape: ", test_counts)):
        print(tag, matrix.shape)
    return train_counts, test_counts, y_train, y_test

## 3.2 Only TfidfVectorizer

In [5]:
def split2(df):
    """Split ``df`` 90/10 and vectorize the text with TfidfVectorizer.

    Args:
        df: DataFrame providing a 'text' column (features) and a
            'sentiment' column (labels).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X entries are
        the TfidfVectorizer outputs for the train and test texts. The
        vectorizer is fitted on the training texts only.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators with 42
        and prints the shapes of both feature matrices.
    """
    np.random.seed(42)
    random.seed(42)

    texts, labels = df['text'], df['sentiment']
    train_text, test_text, y_train, y_test = train_test_split(
        texts, labels, test_size=0.1, random_state=42
    )

    # Fit TF-IDF on the training texts, then apply the same mapping to the test texts.
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(train_text)
    test_tfidf = tfidf.transform(test_text)

    for tag, matrix in (("X_train.shape: ", train_tfidf),
                        ("X_test.shape: ", test_tfidf)):
        print(tag, matrix.shape)
    return train_tfidf, test_tfidf, y_train, y_test

## 3.3 CountVectorizer+TruncatedSVD (Reduce dimension)

In [6]:
def split3(df, dimension):
    """Split ``df`` 90/10, count-vectorize the text, then reduce it with TruncatedSVD.

    Args:
        df: DataFrame providing a 'text' column (features) and a
            'sentiment' column (labels).
        dimension: Number of SVD components to keep (passed as
            ``n_components``).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X entries are
        the SVD-reduced feature matrices. Both the vectorizer and the SVD
        are fitted on the training texts only.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators with 42,
        prints the reduced shapes, and plots the cumulative explained
        variance ratio on the current matplotlib axes.
    """
    random.seed(42)
    np.random.seed(42)
    texts = df['text']
    labels = df['sentiment']
    train_text, test_text, y_train, y_test = train_test_split(
        texts, labels, test_size=0.1, random_state=42
    )

    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_text)
    test_counts = vectorizer.transform(test_text)

    # Pin the SVD's own random_state so the projection is reproducible without
    # depending on the state of the global NumPy generator.
    svd = TruncatedSVD(n_components=dimension, random_state=42)
    X_train_reduced = svd.fit_transform(train_counts)
    X_test_reduced = svd.transform(test_counts)
    print("X_train.shape: ", X_train_reduced.shape)
    print("X_test.shape: ", X_test_reduced.shape)

    # Cumulative explained variance shows how much signal `dimension` components keep.
    plt.plot(np.cumsum(svd.explained_variance_ratio_))
    plt.xlabel('SVD component index')
    plt.ylabel('Cumulative explained variance ratio')
    return X_train_reduced, X_test_reduced, y_train, y_test

## 3.4 TfidfVectorizer+Max_features (Reduce dimension)

In [7]:
def split4(df, dimension):
    """Split ``df`` 90/10 and vectorize the text with a size-limited TfidfVectorizer.

    Args:
        df: DataFrame providing a 'text' column (features) and a
            'sentiment' column (labels).
        dimension: Value passed to TfidfVectorizer as ``max_features``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X entries are
        the TfidfVectorizer outputs for the train and test texts. The
        vectorizer is fitted on the training texts only.

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators with 42
        and prints the shapes of both feature matrices.
    """
    np.random.seed(42)
    random.seed(42)

    texts, labels = df['text'], df['sentiment']
    train_text, test_text, y_train, y_test = train_test_split(
        texts, labels, test_size=0.1, random_state=42
    )

    # Fit the capped TF-IDF vocabulary on the training texts, then reuse it for the test texts.
    tfidf = TfidfVectorizer(max_features=dimension)
    train_tfidf = tfidf.fit_transform(train_text)
    test_tfidf = tfidf.transform(test_text)

    for tag, matrix in (("X_train.shape: ", train_tfidf),
                        ("X_test.shape: ", test_tfidf)):
        print(tag, matrix.shape)
    return train_tfidf, test_tfidf, y_train, y_test

## 3.5 Function Usage

In [8]:
# Example calls for the four split helpers. The whole block is a bare string
# literal, so none of these lines execute; running the cell only echoes the
# text. Remove the surrounding triple quotes to actually build the datasets.
'''
x_train1, x_test1, y_train1, y_test1 = split1(data)
x_train2, x_test2, y_train2, y_test2 = split2(data)
x_train3, x_test3, y_train3, y_test3 = split3(data, 500)
x_train4, x_test4, y_train4, y_test4 = split4(data, 1000)
'''

'\nx_train1, x_test1, y_train1, y_test1 = split1(data)\nx_train2, x_test2, y_train2, y_test2 = split2(data)\nx_train3, x_test3, y_train3, y_test3 = split3(data, 500)\nx_train4, x_test4, y_train4, y_test4 = split4(data, 1000)\n'

# 4 Model evaluation

Each model stores its predictions in `y_pred` and is evaluated with `classification_report(y_test, y_pred)`.