In [50]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

In [93]:
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
            "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
            "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "of",
            "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "shan't", "she", "she'd",
            "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
            "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
            "they've", "this", "those", "through", "to", "until", "up", "very", "was", "wasn't", "we", "we'd",
            "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
            "which", "while", "who", "who's", "whom", "with", "would", "you", "you'd", "you'll", "you're", "you've",
            "your", "yours", "yourself", "yourselves", "above", "again", "against", "aren't", "below", "but", "can't",
            "cannot", "couldn't", "didn't", "doesn't", "don't", "down", "few", "hadn't", "hasn't", "haven't", "if",
            "isn't", "mustn't", "no", "nor", "not", "off", "out", "over", "shouldn't", "same", "too", "under", "why",
            "why's", "won't", "wouldn't", ".", ";", ",", ":", "?", "!", "\"", "-", "'", "...",
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "...", "&"]

In [379]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
import re

tweetfile = pd.read_csv('train.csv', sep=',', na_values=["Not Available"]).dropna()
le_category = preprocessing.LabelEncoder()
# tweetfile.Category = le_category.fit_transform(tweetfile.Category)

# print(set(tweetfile.Category))
train_data, test_data = train_test_split(tweetfile, test_size=0.1, random_state=42)

traincorpus = list(train_data.Tweet)#[0:500]
traincategories = list(train_data.Category)#[0:500]
testcorpus = list(test_data.Tweet)
testcategories = list(test_data.Category)

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
stemmer = SnowballStemmer("english")
def tokenize_and_stem(tweet):
    ret = [stemmer.stem(word)
           for word in tknzr.tokenize(tweet)
           if not re.match('http[s]?', word)]
    return ret

vectorizer = TfidfVectorizer(min_df=2,
                             strip_accents='ascii',
                             use_idf=True,
                             stop_words=stopwords,
                             tokenizer=tokenize_and_stem)
Xtrain = vectorizer.fit_transform(traincorpus)
Xtest = vectorizer.transform(testcorpus)

In [355]:
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.metrics.classification import classification_report

classifier = RandomForestClassifier(random_state=89,
                                    n_jobs=8,
                                    class_weight='balanced_subsample',
                                    n_estimators=150,
                                    max_leaf_nodes=600,
                                    verbose=True)
svcfit = classifier.fit(Xtrain.toarray(), traincategories)
svcpred = classifier.predict(Xtest.toarray())

print(classification_report(testcategories, svcpred))

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s


             precision    recall  f1-score   support

      Tweet       0.00      0.00      0.00         1
   negative       0.44      0.43      0.44        74
    neutral       0.49      0.43      0.46       219
   positive       0.65      0.72      0.69       303

avg / total       0.57      0.58      0.57       597



[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:   11.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)


In [290]:
submissionfile = pd.read_csv('test.csv', dtype={'Id': object}, sep=',')
submissioncorpus = list(submissionfile.Category) # bad column name, that's actually tweets
Xsubm = vectorizer.transform(submissioncorpus)
svcpred_subm = classifier.predict(Xsubm.toarray())
submissionfile.Category = svcpred_subm
submissionfile.to_csv('subm_rf.csv', index=False)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.2s finished
