## Setting up model

In [3]:
import spacy
import pandas as pd
import pprint

from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [6]:
data = pd.read_pickle("./data/data.pkl")
data = data[['text','class']]

In [7]:
data.info()
class_list = data['class'].unique()
print(class_list)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5362 entries, 0 to 5361
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5362 non-null   object
 1   class   5362 non-null   object
dtypes: object(2)
memory usage: 83.9+ KB
['neutral' 'bad' 'blocker' 'good']


## Splitting data set

For now, we will just train a model to predict the "point" attribute for a given sentence.

In [8]:
Train, Valid = train_test_split(data,test_size = 0.2)

Train = list(Train.to_records(index=False))
Valid = list(Valid.to_records(index=False))

In [9]:
pprint.pprint(Train)

[('undertake analysis and profiling of your browsing history and purchases (and any other data provided by you or on your behalf or which we collect on your behalf) in order to identify and inform you of products and services that we consider are likely to interest you;', 'blocker'),
 ('You are responsible for anything that happens through or involving your account until you close down your account or prove to Alignable’s reasonable satisfaction that your account security was compromised due to no fault of your own.', 'neutral'),
 ('In addition, we use <strong>"pixel tags"</strong> (also referred to as clear GIFs, Web beacons, or Web bugs).\nPixel tags are tiny graphic images with a unique identifier, similar in function to cookies, which are used to track online movements of Web users', 'bad'),
 ('YOU AGREE THAT WE CAN ONLY BRING A CLAIM AGAINST EACH OTHER ON AN INDIVIDUAL BASIS AND YOU AGREE NOT TO BRING OR PARTICIPATE IN A CLASS OR REPRESENTATIVE ACTION, PRIVATE ATTORNEY GENERAL ACT

 ('Like most websites you visit, our website uses cookies and other similar technologies to distinguish you from other users of our website.\nThis helps us to provide you with a good experience when you browse our website (such as storing and managing user preferences) and to improve the services we provide to you.\nOur website also uses cookies to deliver targeted advertising, enable functions of the website, and gather analytic and user data.\nBy using our website (through any device) you agree that this Cookies Policy, together with our Privacy Policy and our Terms and Conditions, applies to your use.', 'neutral'),
 ('Id Software’s Website and its products are not intended to be used by children.\nId Software does not knowingly solicit Personal Information from persons under the age of 13.\nIf you are under 13 years of age, you should not access, or attempt to obtain access to, QUAKE LIVE.', 'neutral'),
 ('We have no liability to you or any other person for loss, damage, or destruct

 ('4.\n<strong>What do we collect?</strong>\n</p>\n<p>4.1 Information which you upload to our service or otherwise give us such as your name, username, email address and other contact details as well as any personal information which you include in public activities on our service, e.g.\nforum posts.</p>\n<p>4.2 Your billing address as well as the last four digits and expiry date of your credit card, which may be sent to us by our payment providers.</p>\n<p>4.3 Automated information such as the internet protocol (IP) address used to connect your device to the internet, connection information such as browser type and version, information about your device including device-type and device identifier, operating system and platform, mobile network data, a unique reference number linked to the data you enter on our system, login details, the site from which you arrived at our service, details of your activity with date / time stamps including pages you visited and your searches / transactio

## Writing out to binary file

In [6]:
def make_docs(data):
    docs = []
    for doc, ann in nlp.pipe(data,as_tuples=True):
                
        if ann == "good":
            doc.cats['A'] = 0
            doc.cats['B'] = 0
        elif ann == "neutral":
            doc.cats['A'] = 1
            doc.cats['B'] = 0
        elif ann == "bad":
            doc.cats['A'] = 0
            doc.cats['B'] = 1
        elif ann == "blocker":
            doc.cats['A'] = 1
            doc.cats['B'] = 1
        else:
            print("Error")
        
        docs.append(doc)
    return (docs)

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
train_docs = make_docs(Train)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/train.spacy")

valid_docs = make_docs(Valid)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./data/valid.spacy")