# Baseline Model

This model establishes a baseline for our later models by making predictions based solely on the number of labelled positive and negative words in a given tweet. If a tweet contains more positive words than negative words, it is classified as positive, and vice versa (ties are treated as positive). Because text files of positive and negative words are provided along with the training and testing files, this baseline model relies entirely on those given word lists. Its results can later be used as a reference point for judging the robustness of each machine learning model.

In [1]:
import utils

In [2]:
# Pre-processed tweet CSVs that are passed to classify().
TRAIN_PROCESSED_FILE = 'Train-processed.csv'
TEST_PROCESSED_FILE = 'Test-processed.csv'
# Sentiment word lists, one word per line (loaded with file_to_wordset()).
POSITIVE_WORDS_FILE = 'positive-words.txt'
NEGATIVE_WORDS_FILE = 'negative-words.txt'
# Mode flag: the cells below pass test_file=(not TRAIN) to classify(), so
# True means the rows are parsed as labeled training data.
TRAIN = True

In [3]:
def file_to_wordset(filename):
    ''' Build a Python set from a file that lists one word per line '''
    with open(filename, 'r', encoding='utf-8') as handle:
        # Surrounding whitespace (including the newline) is stripped from
        # every line; duplicate entries collapse because the result is a set.
        return {entry.strip() for entry in handle}

In [4]:
def classify(processed_csv, test_file=True, **params):
    ''' Classify every tweet in a processed CSV by counting lexicon words.

    A tweet is predicted positive (1) when it contains at least as many
    positive-lexicon words as negative-lexicon words, and negative (0)
    otherwise. A word found in both lexicons is counted as positive only.

    Args:
        processed_csv: path of a CSV file with one tweet per line. Rows are
            "tweet_id,tweet" when test_file is True and
            "tweet_id,label,tweet" otherwise.
        test_file: True when the rows carry no label column.
        **params: must contain 'positive_words' and 'negative_words', the
            paths of the word-per-line lexicon files.

    Returns:
        A list of (tweet_id, prediction) tuples for a test file, or a list
        of (tweet_id, int(label), prediction) tuples for a labeled file.

    Raises:
        KeyError: if 'positive_words' or 'negative_words' is not supplied.
        ValueError: if a non-blank row has too few fields, or a label is
            not an integer.
    '''
    positive_words = file_to_wordset(params.pop('positive_words'))
    negative_words = file_to_wordset(params.pop('negative_words'))
    predictions = []
    with open(processed_csv, 'r', encoding='utf-8') as rows:
        for line in rows:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the unpacking below.
                continue
            # The tweet is always the last column, so the number of splits is
            # capped: commas inside the tweet text stay part of the tweet
            # rather than producing extra fields.
            if test_file:
                tweet_id, tweet = line.split(',', 1)
            else:
                tweet_id, label, tweet = line.split(',', 2)
            pos_count, neg_count = 0, 0
            for word in tweet.split():
                if word in positive_words:
                    pos_count += 1
                elif word in negative_words:
                    neg_count += 1
            # Ties (including tweets with no lexicon words) count as positive.
            prediction = 1 if pos_count >= neg_count else 0
            if test_file:
                predictions.append((tweet_id, prediction))
            else:
                predictions.append((tweet_id, int(label), prediction))
    return predictions


In [5]:
# Baseline model accuracy on training data
TRAIN = True
predictions = classify(
    TRAIN_PROCESSED_FILE,
    test_file=(not TRAIN),
    positive_words=POSITIVE_WORDS_FILE,
    negative_words=NEGATIVE_WORDS_FILE,
)
# Each training prediction is (tweet_id, label, prediction); the accuracy is
# the percentage of rows whose label equals the prediction.
correct = sum(label == guess for _, label, guess in predictions) * 100.0 / len(predictions)
print('Correct = %.2f%%' % correct)

Correct = 20.03%


In [6]:
# Baseline model on test data

# Switch to test mode: the classify() call in this cell passes
# test_file=(not TRAIN), so rows are parsed as unlabeled "tweet_id,tweet".
TRAIN = False

def save_results_to_csv(results, csv_file):
    ''' Save list of type [(tweet_id, positive)] to csv in Kaggle format

    Writes an "id,prediction" header line followed by one
    "tweet_id,prediction" line per result. Any existing file at csv_file
    is overwritten.

    Args:
        results: iterable of (tweet_id, prediction) pairs.
        csv_file: path of the CSV file to write.
    '''
    with open(csv_file, 'w', encoding='utf-8') as out:
        # No trailing space after the last column name: it would become
        # part of the header ("prediction ") for anything parsing the file.
        out.write('id,prediction\n')
        for tweet_id, pred in results:
            # %s converts both fields with str(), so non-string ids are
            # accepted as well.
            out.write('%s,%s\n' % (tweet_id, pred))
            
# Predict the unlabeled test tweets and write the results to 'baseline.csv'.
predictions = classify(
    TEST_PROCESSED_FILE,
    test_file=(not TRAIN),
    positive_words=POSITIVE_WORDS_FILE,
    negative_words=NEGATIVE_WORDS_FILE,
)
save_results_to_csv(predictions, 'baseline.csv')