In [1]:
import pandas as pd
import numpy as np
import pickle
import csv

---
## Data Loading

In [2]:
def load_data(filename):
    """Read a text file and return its lines as a list of strings.

    Lines are returned exactly as the file iterator yields them, i.e. each
    entry still carries its trailing newline (when the file has one there).

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read (opened in text mode, "r").

    Returns
    -------
    list of str
        One entry per line, in file order; an empty list for an empty file.
    """
    # The context manager closes the handle as soon as reading finishes, and
    # also when reading raises, instead of leaving it to garbage collection.
    with open(filename, "r") as handle:
        return list(handle)

In [3]:
# One row per line of the positive training file, every row labelled 1.
pos_tweets = pd.DataFrame(
    {'tweet': load_data('twitter-datasets/train_pos_full.txt')}
).assign(sentiment=1)

In [4]:
# One row per line of the negative training file, every row labelled -1.
neg_tweets = pd.DataFrame(
    {'tweet': load_data('twitter-datasets/train_neg_full.txt')}
).assign(sentiment=-1)

In [5]:
# Stack the positive and negative examples into a single training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# half keeps its own row numbers, so index labels repeat and label-based
# operations (.loc, index alignment on assignment) become ambiguous.
train_tweets = pd.concat([pos_tweets, neg_tweets], axis=0, ignore_index=True)

In [6]:
# Load the raw test lines, then keep only the text after the first comma of
# each line (lines without a comma are kept whole).
# NOTE(review): presumably the dropped prefix is a per-line id -- confirm
# against the data file.
test_tweets = pd.DataFrame({'tweet': load_data('twitter-datasets/test_data.txt')})
test_tweets['tweet'] = [raw_line.split(',', 1)[-1] for raw_line in test_tweets['tweet']]

---
## Data Pre-processing

In [7]:
from preprocessing import tweets_preprocessing

In [9]:
# Run the project's preprocessing helper over the training tweets.
# NOTE: this rebinds train_tweets to the helper's result, so executing the
# cell a second time feeds already-processed data back into the helper.
train_tweets = tweets_preprocessing(train_tweets)

Contraction words expansion finished!
Emoji to hashtag transformation finished!
Sentiment words emphasizing finished!
Number to hashtag <number> transformation finished!
ALL DONE!


In [8]:
# Run the same preprocessing helper over the test tweets.
# NOTE: this rebinds test_tweets to the helper's result, so executing the
# cell a second time feeds already-processed data back into the helper.
test_tweets = tweets_preprocessing(test_tweets)

Contraction words expansion finished!
Emoji to hashtag transformation finished!
Sentiment words emphasizing finished!
Number to hashtag <number> transformation finished!
ALL DONE!


In [16]:
# Persist both preprocessed frames as pickle files in the working directory,
# e.g. so they can be reloaded later (pd.read_pickle) without repeating the
# preprocessing step.
train_tweets.to_pickle('train_tweets_after_preprocess.pkl')
test_tweets.to_pickle('test_tweets_after_preprocess.pkl')

---
## fastText Model Building and Prediction

In [None]:
def fast_text(tweets, test_tweets):
    """Train a supervised fastText model on labelled tweets and label the test set.

    Writes one line per training tweet to ``fasttext_train.txt`` in the
    working directory (right-stripped tweet text, a space, then
    ``label_<sentiment>``), trains a classifier from that file with
    ``fasttext_model`` passed as the output name, and predicts every test
    tweet.

    NOTE(review): this function uses the name ``fasttext`` from the enclosing
    namespace; no visible cell imports it, so make sure ``import fasttext``
    has been executed before calling.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Training data with a ``tweet`` column of strings and a ``sentiment``
        column. The frame is only read, never modified.
    test_tweets : pandas.DataFrame
        Data to classify; only its ``tweet`` column is used.

    Returns
    -------
    list of int
        Every label returned by ``classifier.predict``, flattened in order
        and converted with ``int``.
    """
    # Build the label strings locally instead of overwriting the caller's
    # 'sentiment' column: mutating it would make a second call on the same
    # frame emit 'label_label_<n>' and leave the notebook's data altered.
    training_labels = ['label_' + str(value) for value in tweets['sentiment']]

    # Create fastText train file; the context manager guarantees the data is
    # flushed and the handle closed before training reads the file back.
    with open('fasttext_train.txt', 'w') as train_file:
        for tweet, label in zip(tweets['tweet'], training_labels):
            train_file.write(tweet.rstrip() + ' ' + label + '\n')

    # Train model
    classifier = fasttext.supervised('fasttext_train.txt', 'fasttext_model',
                                     label_prefix='label_', epoch=20, dim=200)

    test_texts = np.array(test_tweets['tweet'])

    # Prediction: predict() yields a sequence of labels per text; flatten it.
    predictions = classifier.predict(test_texts)
    return [int(value) for labels in predictions for value in labels]

In [11]:
# Train on the training tweets, predict the test tweets, and keep the
# returned labels as a NumPy array.
pred = np.array(fast_text(train_tweets, test_tweets))

In [12]:
# Write the predictions as a two-column CSV (Id, Prediction) with 1-based ids.
# newline='' is what the csv module expects of files it writes to: the writer
# emits its own '\r\n' terminators, and without it they are translated a
# second time on platforms whose line ending is '\r\n', leaving a blank row
# after every record.
with open('ft_submission.csv', 'w', newline='') as submission_file:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(submission_file, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    for row_id, label in enumerate(pred, start=1):
        writer.writerow({'Id': row_id, 'Prediction': label})