## Part A: Subreddit Prediction ##

In [1]:
subreddit_train = "coursework_subreddit_train.json"
subreddit_test = "coursework_subreddit_test.json"

#!gsutil cp gs://textasdata/coursework/coursework_subreddit_train.json $subreddit_train 
#!gsutil cp gs://textasdata/coursework/coursework_subreddit_test.json  $subreddit_test

In [2]:
import pandas as pd

train_threads = pd.read_json(path_or_buf=subreddit_train, lines=True)
print(list(train_threads.columns.values))
print(train_threads.head())
print(train_threads.size)

['is_self_post', 'posts', 'subreddit', 'title', 'url']
   is_self_post                                              posts  \
0           1.0  [{'body': 'I think everyone has that one frien...   
1           1.0  [{'body': 'I not 100% sure this is the right p...   
2           1.0  [{'body': '', 'author': 'Leisure321', 'url': '...   
3           1.0  [{'body': 'It's called 'forgetting things'.', ...   
4           1.0  [{'body': 'How would I do this? I am looking t...   

        subreddit                                              title  \
0   relationships  How do I [23F] communicate with my self-center...   
1  summonerschool  What Cherry switch do you recommend for League...   
2       askreddit                   Where do memes go when they die?   
3           trees                     Some weird long term affects??   
4        buildapc  Simple question: If I install Windows to a sta...   

                                                 url  
0  https://www.reddit.com/r/relation

In [3]:
test_threads = pd.read_json(path_or_buf=subreddit_test, lines=True)
print(test_threads.head())
print(test_threads.size)

   is_self_post                                              posts  \
0           1.0  [{'body': 'Was watching a VOD from last years ...   
1           1.0  [{'body': 'Basically what the title says.', 'u...   
2           1.0  [{'body': '', 'author': 'Daft-Punk', 'url': 'h...   
3           1.0  [{'body': 'I start running this year. I do it ...   
4           1.0  [{'body': '[deleted]', 'url': 'https://www.red...   

       subreddit                                              title  \
0      starcraft  Just a reminder on how much SC2 has evolved th...   
1    whowouldwin  Your Favorite Hero Now Has A Healing Factor As...   
2      askreddit  If you could live anywhere in the world, where...   
3      askreddit                   Do you ever get use to exercise?   
4  tipofmytongue         [TOMT] [book] A scary french book for kids   

                                                 url  
0  https://www.reddit.com/r/starcraft/comments/mq...  
1  https://www.reddit.com/r/whowouldwin/co

In [4]:
subreddit_counts = train_threads['subreddit'].value_counts()
print(subreddit_counts.describe())
top_subbreddits = subreddit_counts.nlargest(20)
top_subbreddits_list = top_subbreddits.index.tolist()
print(top_subbreddits)

count     20.000000
mean      72.800000
std       73.368285
min       28.000000
25%       36.250000
50%       45.500000
75%       63.750000
max      334.000000
Name: subreddit, dtype: float64
askreddit               334
leagueoflegends         196
buildapc                131
explainlikeimfive        82
trees                    66
techsupport              63
pcmasterrace             62
gaming                   62
electronic_cigarette     59
relationships            48
tipofmytongue            43
hearthstone              38
jailbreak                38
summonerschool           37
atheism                  37
reddit.com               34
whowouldwin              33
movies                   33
personalfinance          32
starcraft                28
Name: subreddit, dtype: int64


In [5]:
train_labels = train_threads['subreddit']
test_labels = test_threads['subreddit']

In [6]:
# Initialise nlp
import spacy
#!python -m spacy download en_core_web_md

nlp = spacy.load('/usr/lib/python3.7/site-packages/en_core_web_md/en_core_web_md-2.0.0', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.DependencyParser at 0x7f22042fda98>)

In [7]:
# Get Stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/stuart/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Define tokenizer and normalizer

#@Tokenize
def spacy_tokenize(string):
    tokens = list()
    doc = nlp(string)
    for token in doc:
        tokens.append(token)
    return tokens

#@Normalize
def normalize(tokens):
    normalized_tokens = list()
    for token in tokens:
        normalized = token.text.lower().strip()
        if ((token.is_alpha or token.is_digit)):
          normalized_tokens.append(normalized)
    return normalized_tokens

#@Tokenize and normalize
def tokenize_normalize(string):
    return normalize(spacy_tokenize(string))

In [35]:
# Create vectorizers

from sklearn.feature_extraction.text import CountVectorizer 
one_hot_vectorizer = CountVectorizer(tokenizer=tokenize_normalize, binary=True)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_normalize)

In [31]:
# Create Classifiers

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

most_freq_class = DummyClassifier(strategy="most_frequent")
rand_class = DummyClassifier(strategy="stratified")
log_reg_class = LogisticRegression(solver="saga")
svc_class = SVC(kernel="rbf")
ber_nb_class = BernoulliNB() # Chosen classifier

In [11]:
def expand_posts(threads):
    """ Lines below explained in order:
    Split posts column into new columns
    Merge the back into train_threads
    Drop the unexpanded posts
    Transform the numeric rows from the split into separate rows
    Remove variable column left from melt
    Remove all rows with NaN
    """
    expanded_posts = train_threads.posts.apply(pd.Series)\
    .merge(train_threads.rename(columns={'url': 'thread_url'}), left_index=True, right_index=True)\
    .drop(["posts"], axis=1)\
    .melt(id_vars=["is_self_post", "subreddit", "title", "thread_url"], value_name="post")\
    .drop("variable", axis=1)\
    .dropna()

    # Expand the post dictionary into columns and remove invalid posts
    return pd.concat([
        expanded_posts.drop(['post'], axis=1),
        expanded_posts['post'].apply(pd.Series)
    ], axis=1)\
.dropna(subset=['body', 'author'])

In [12]:
# Restructure threads into posts
train_posts = expand_posts(train_threads)
test_posts = expand_posts(test_threads)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    

In [33]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def evaluation_summary(description, predictions, true_labels):
    print("Evaluation for: " + description)
    precision = precision_score(predictions, true_labels)
    recall = recall_score(predictions, true_labels)
    accuracy = accuracy_score(predictions, true_labels)
    f1 = fbeta_score(predictions, true_labels, 1) #1 means f_1 measure
    print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (description,accuracy,precision,recall,f1))
    print(classification_report(predictions, true_labels, digits=3))
    print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))

In [26]:
from sklearn.pipeline import Pipeline, FeatureUnion 
one_hot_pipeline = Pipeline([
    ("union", FeatureUnion([
        ("title", Pipeline([
            ("select", ItemSelector('title')),
            ("vec", one_hot_vectorizer)
        ])),
        ("body", Pipeline([
            ("select", ItemSelector('body')),
            ("vec", one_hot_vectorizer)
        ])),
        ("author", Pipeline([
            ("select", ItemSelector('author')),
            ("vec", one_hot_vectorizer)
        ])),
    ]))
])
train_one_hot_features = one_hot_pipeline.fit_transform(train_posts)
test_one_hot_features = one_hot_pipeline.transform(test_posts)

In [27]:
tfidf_pipeline = Pipeline([
    ("union", FeatureUnion([
        ("title", Pipeline([
            ("select", ItemSelector('title')),
            ("vec", tfidf_vectorizer)
        ])),
        ("body", Pipeline([
            ("select", ItemSelector('body')),
            ("vec", tfidf_vectorizer)
        ])),
        ("author", Pipeline([
            ("select", ItemSelector('author')),
            ("vec", tfidf_vectorizer)
        ])),
    ]))
])
train_tfidf_features = tfidf_pipeline.fit_transform(train_posts)
test_tfidf_features = tfidf_pipeline.transform(test_posts)

In [36]:
most_freq_class.fit(train_one_hot_features, train_labels)
evaluation_summary("One-Hot Most Freq", most_freq_class.predict(test_one_hot_features), test_labels)

Evaluation for: One-Hot Most Freq


ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [15]:
# OneHot title features
one_hot_vectorizer.fit(train_posts['title'])
train_title_one_hot_features = one_hot_vectorizer.transform(train_posts['title'])
test_title_one_hot_features = one_hot_vectorizer.transform(test_posts['title'])

# OneHot body features
one_hot_vectorizer.fit(train_posts['body'])
train_body_one_hot_features = one_hot_vectorizer.transform(train_posts['body'])
test_body_one_hot_features = one_hot_vectorizer.transform(test_posts['body'])

# OneHot author features
one_hot_vectorizer.fit(train_posts['author'])
train_author_one_hot_features = one_hot_vectorizer.transform(train_posts['author'])
test_author_one_hot_features = one_hot_vectorizer.transform(test_posts['author'])

In [18]:
# Tfidf title features
tfidf_vectorizer.fit(test_posts['title'])
train_title_tfidf_features = tfidf_vectorizer.transform(train_posts['title'])
test_title_tfidf_features = tfidf_vectorizer.transform(test_posts['title'])

# Tfidf body features
tfidf_vectorizer.fit(train_posts['body'])
train_body_tfidf_features = tfidf_vectorizer.transform(train_posts['body'])
test_body_tfidf_features = tfidf_vectorizer.transform(test_posts['body'])

# Tfidf author features
tfidf_vectorizer.fit(train_posts['author'])
train_author_tfidf_features = tfidf_vectorizer.transform(train_posts['author'])
test_author_tfidf_features = tfidf_vectorizer.transform(test_posts['author'])

## Part B: Discourse prediction ##

In [19]:
discourse_train = "coursework_discourse_train.json"
discourse_test = "coursework_discourse_test.json"
  
!gsutil cp gs://textasdata/coursework/coursework_discourse_train.json $discourse_train  
!gsutil cp gs://textasdata/coursework/coursework_discourse_test.json  $discourse_test

Copying gs://textasdata/coursework/coursework_discourse_train.json...
\ [1 files][ 60.2 MiB/ 60.2 MiB]                                                
Operation completed over 1 objects/60.2 MiB.                                     
Copying gs://textasdata/coursework/coursework_discourse_test.json...
| [1 files][ 15.1 MiB/ 15.1 MiB]                                                
Operation completed over 1 objects/15.1 MiB.                                     


In [20]:
# The reddit thread structure is nested with posts in a new content.
# This block reads the file as json and creates a new data frame.
import pandas as pd
import json

def load_posts(file):
  # A temporary variable to store the list of post content.
  posts_tmp = list()

  with open(file) as jsonfile:
    for i, line in enumerate(jsonfile):
     # if (i > 2): break
      thread = json.loads(line)
      for post in thread['posts']:
        # NOTE: This could be changed to use additional features from the post or thread.
        # DO NOT change the labels for the test set.
        posts_tmp.append((thread['subreddit'], thread['title'], thread['url'],
                        post['id'], post.get('author', ""), post.get('body', ""), post.get("majority_link", ""), 
                        post.get('post_depth', 0), post.get('majority_type', ""), # discourse type label 
                        post.get('in_reply_to', "") ))

# Create the posts data frame.  
  labels = ['subreddit', 'title', 'url', 'id', 'author', 'body', 'majority_link', 
          'post_depth', 'discourse_type', 'in_reply_to']
  return pd.DataFrame(posts_tmp, columns=labels)

In [21]:
train_posts = load_posts(discourse_train)
# Filter out empty labels
train_posts = train_posts[train_posts['discourse_type'] != ""]
print(train_posts.head())
print("Num posts: ", train_posts.size)

    subreddit                           title  \
0  worldofpvp  Help me decide my new PvP main   
1  worldofpvp  Help me decide my new PvP main   
2  worldofpvp  Help me decide my new PvP main   
3  worldofpvp  Help me decide my new PvP main   
4  worldofpvp  Help me decide my new PvP main   

                                                 url          id  \
0  https://www.reddit.com/r/worldofpvp/comments/2...   t3_2v0anq   
1  https://www.reddit.com/r/worldofpvp/comments/2...  t1_codb2p9   
2  https://www.reddit.com/r/worldofpvp/comments/2...  t1_codg0we   
3  https://www.reddit.com/r/worldofpvp/comments/2...  t1_coeatsq   
4  https://www.reddit.com/r/worldofpvp/comments/2...  t1_codbyit   

         author                                               body  \
0      TyrickEU  Hi. \nAs a raider previously, i had no problem...   
1          vurt  [deleted]  \n ^^^^^^^^^^^^^^^^0.5422 \n > [Wha...   
2   OptimusNice  This goes mostly for 3v3 since that seems to b...   
3               

The label for the post we will be predicting is in the discourse_type column.

In [22]:
test_posts = load_posts(discourse_test)
# Filter out empty labels
test_posts = test_posts[test_posts['discourse_type'] != ""]
print("Num posts: ", test_posts.size)


Num posts:  198120


In [23]:
train_labels = train_posts['discourse_type']
test_labels = test_posts['discourse_type']

Examine the distribution over labels on the training data.

In [24]:
discourse_counts = train_labels.value_counts()
print(discourse_counts.describe())

top_discourse = discourse_counts.nlargest(200)
print(top_discourse)
top_discourse = top_discourse.index.tolist()
print(top_discourse)

count       10.000000
mean      7926.700000
std       9664.321866
min       1266.000000
25%       1671.500000
50%       3235.500000
75%      11919.750000
max      31419.000000
Name: discourse_type, dtype: float64
answer              31419
elaboration         14775
question            13610
appreciation         6849
agreement            3868
disagreement         2603
humor                1787
other                1633
announcement         1457
negativereaction     1266
Name: discourse_type, dtype: int64
['answer', 'elaboration', 'question', 'appreciation', 'agreement', 'disagreement', 'humor', 'other', 'announcement', 'negativereaction']


In [25]:
# Define the features
X_train = #features from training data
X_test = #features from test data

from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='stratified',random_state=0)
clf.fit(X_train, train_posts['discourse_type'])
predictions = clf.predict(X_test)  
print(classification_report(predictions, test_posts['discourse_type']))

SyntaxError: invalid syntax (<ipython-input-25-916b3084450d>, line 2)