In [1]:
import pandas as pd
import joblib
from flair.models import TextClassifier
from flair.data import Sentence 
import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from normalise import normalise
import numpy as np
# Load the en_core_web_sm spaCy pipeline once at import time;
# preprocess_text() calls this object on every text it cleans.
nlp = en_core_web_sm.load()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [68]:
def preprocess_text(text):
    """Clean one text for sentiment scoring.

    Pipeline: normalise -> spaCy parse -> drop punctuation tokens ->
    drop stop words -> join lemmas -> lower-case.

    Returns the cleaned string, or None when any step raises (for example
    when ``text`` is a missing value rather than a string).
    """
    try:
        tokens = nlp(_normalize(text))
        # Both filters take and return a sequence of tokens, so they can be
        # applied in order through the same variable.
        for token_filter in (_remove_punct, _remove_stop_words):
            tokens = token_filter(tokens)
        return _lemmatize(tokens).lower()
    except Exception:
        # Best-effort: a text that cannot be processed becomes None.
        return None
    

def _normalize(text):
    # some issues in normalise package
    try:
        return ' '.join(normalise(text, variety="BrE", user_abbrevs={}, verbose=False))
    except:
        return text

def _remove_punct(doc):
    return [t for t in doc if t.text not in string.punctuation]

def _remove_stop_words(doc):
    return [t for t in doc if not t.is_stop]

def _lemmatize(doc):
    return ' '.join([t.lemma_ for t in doc])


def glower(text):
    """Lower-case ``text``; return None if calling ``text.lower()`` raises."""
    try:
        lowered = text.lower()
    except Exception:
        # Values without a usable .lower() (None, floats, ...) map to None.
        return None
    return lowered

In [55]:
#data = joblib.load('./list_of_shortened_files.pkl')
# Read the shortened article texts ./shortened/1.txt ... ./shortened/511.txt
# in numeric order, one string per file.
shorten = []
for i in range(1, 512):
    # Plain read mode: the files are only read, and 'r+' would also request
    # write access (failing on read-only files).
    with open('./shortened/{}.txt'.format(i), 'r', encoding="utf8") as f:
        shorten.append(f.read())

In [51]:
# Load flair's 'en-sentiment' text classifier; get_score()/get_value() use this
# module-level object. The log line printed by this cell shows the model file
# that was actually loaded.
classifier = TextClassifier.load('en-sentiment')

2020-05-03 01:21:08,837 loading file C:\Users\ishan\.flair\models\imdb-v0.4.pt


In [52]:

# Smoke test: classify one hand-written sentence to check the model works.
sentence = Sentence('Flair is pretty neat!')
# predict() is called for its side effect: the labels are read from
# `sentence` afterwards rather than from a return value.
classifier.predict(sentence)
# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)

Sentence above is:  [POSITIVE (0.6636102795600891)]


In [53]:
# Inspect the first predicted label as a dict; its 'value' and 'confidence'
# keys are what get_value() and get_score() read.
sentence.labels[0].to_dict()

{'value': 'POSITIVE', 'confidence': 0.6636102795600891}

In [56]:
# Sanity check: one entry per shortened file (files 1..511 were read).
len(shorten)

511

In [57]:
# Load the annotated articles; later cells read the 'Headlines' and 'Context'
# columns from this frame.
full = pd.read_csv('./China_withOnlyFullText.csv', encoding='utf-8')
# Preview the first rows.
full.head()

Unnamed: 0,s_num,score,eastern/western,Headlines,Context,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,1,-1.0,W,This crisis has taught us the true cost of doi...,Public health is not the only area in which Ch...,-13,,
1,2,-1.0,W,The forgotten victims of China’s Belt and Road...,But one group of victims is often overlooked: ...,,,
2,3,-1.0,W,China’s debt traps around the world are a trad...,"But the 93-year-old leader, who recaptured the...",,,
3,4,-1.0,W,Why countries might want out of China’s Belt a...,"“The future’s coming now,” a group of children...",gave good projects too,,
4,5,0.0,W,Silk Road forum showcases China’s power — and ...,"The videos are so cringe-inducing, so ridiculo...",,-1.0,


In [58]:
# Build "<headline>. <shortened text>" strings, pairing rows positionally.
# zip() stops at the shorter of the two inputs.
head_shorten = []
for headline, short_text in zip(full['Headlines'].tolist(), shorten):
    try:
        head_shorten.append(headline + '. ' + short_text)
    except TypeError:
        # A missing headline is not a str and cannot be concatenated; keep
        # the '-' placeholder for that row. Only TypeError is caught so that
        # unrelated failures (and Ctrl-C) are no longer silently swallowed.
        head_shorten.append('-')

# Column layout of the results frame: for each text variant, the text itself
# followed by its predicted label and confidence score.
cols = [
    prefix + suffix
    for prefix in ('short', 'full', 'head')
    for suffix in ('', '_label', '_score')
]

In [59]:
# Empty results frame with the columns listed in `cols`; later cells fill the
# text columns first and the *_label / *_score columns afterwards.
df = pd.DataFrame(columns=cols)
def _top_label_field(text, field):
    """Classify ``text`` and return ``field`` from its first predicted label.

    Uses the module-level ``classifier``. Returns None when classification
    fails for any ordinary reason (for example ``text`` is None, or no label
    was produced).
    """
    try:
        sentence = Sentence(text)
        classifier.predict(sentence)
        return sentence.labels[0].to_dict()[field]
    except Exception:
        # Exception, not a bare ``except``: KeyboardInterrupt/SystemExit must
        # still be able to stop a long DataFrame.apply run.
        return None


def get_score(text):
    """Return the confidence of the top sentiment label for ``text``, or None on failure."""
    return _top_label_field(text, 'confidence')


def get_value(text):
    """Return the value of the top sentiment label for ``text``, or None on failure."""
    return _top_label_field(text, 'value')
    
# Show the still-empty frame to confirm the column layout.
df.head()

Unnamed: 0,short,short_label,short_score,full,full_label,full_score,head,head_label,head_score


In [69]:
# Fill the three text columns of the results frame.
df['head'] = full['Headlines'].tolist()
#df['head_short'] = head_shorten
df['full'] = full['Context']
# Assign the shortened texts to the first len(shorten) rows in one aligned
# column assignment; rows without a shortened text stay NaN.
# The previous `df['short'].iloc[i] = ...` loop was chained assignment: it
# wrote through an intermediate Series, which pandas warns about and does not
# guarantee will update `df` itself.
# If there are more shortened texts than rows, the Series constructor raises
# instead of silently dropping texts.
df['short'] = pd.Series(shorten, index=df.index[:len(shorten)], dtype=object)

In [71]:
# Clean every text column in place with preprocess_text(); texts that cannot
# be processed become None.
for text_col in ('short', 'full', 'head'):
    df[text_col] = df[text_col].apply(preprocess_text)

In [72]:
# Preview the preprocessed texts; the *_label / *_score columns are not
# filled until the scoring cell runs.
df.head()

Unnamed: 0,short,short_label,short_score,full,full_label,full_score,head,head_label,head_score
0,china particularly interested share wealth rai...,,,public health area china ’s insistence play no...,,,crisis teach true cost business china,,
1,pacific partnership right answer china 's stat...,,,group victim overlook chinese worker dispatch ...,,,forget victim china ’s belt road initiative,,
2,exception united state address issue push forw...,,,93-year old leader recapture premiership year ...,,,china ’s debt trap world trademark imperialist...,,
3,president xi jinpe china evoke camel carava...,,,""" future ’s come "" group child sing clip "" bel...",,,country want china ’s belt road,,
4,china long close strategic partner pakistan pr...,,,video cringe inducing ridiculously saccharine ...,,,silk road forum showcase china ’s power — powe...,,


In [None]:
# Score each text variant: <col>_label gets the predicted label and
# <col>_score its confidence.
for text_col in ('short', 'full', 'head'):
    texts = df[text_col]
    for suffix, scorer in (('_label', get_value), ('_score', get_score)):
        df[text_col + suffix] = texts.apply(scorer)

In [None]:
# Preview the frame after the label and score columns have been filled.
df.head()

In [None]:
# Persist the scored frame as a UTF-8 encoded CSV.
df.to_csv('./sentiment_data.csv', encoding='utf-8')