In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.util import mark_negation
from nltk.corpus import sentiwordnet as swn
import numpy as np

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
df = pd.read_csv("/content/drive/MyDrive/project_zzy/data/task-1/train.csv")
df.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [None]:
import re

In [None]:
df['ori_word'] = df['original'].apply(lambda x : re.findall(r"<(.+?)/>",x)[0].lower())
df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,ori_word
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,isis
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,syria
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,coalition
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4,engage
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0,vice


In [None]:
df['original'] = df['original'].apply(lambda x : re.sub('<(.*)/>', '', x))
df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,ori_word
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,isis
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,syria
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,coalition
3,76,"In an apparent first , Iran and Israel each o...",slap,20000,0.4,engage
4,6164,Trump was told weeks ago that Flynn misled Pr...,school,0,0.0,vice


In [None]:
def get_entities(text):
  """Extract named entities from `text` with the module-level spaCy pipeline.

  Returns a list of ``(entity_text, entity_label)`` tuples, one per entity
  found in ``nlp(text).ents``, in the order spaCy yields them.
  """
  parsed = nlp(text)
  pairs = []
  for span in parsed.ents:
    pairs.append((span.text, span.label_))
  return pairs

In [None]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The tag is matched on its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag yields ``None`` so the caller can
    pick its own fallback.
    """
    # Attribute names are looked up lazily so `wordnet` is only touched
    # when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None




def process_text(text):
  """Tokenize, POS-tag and lemmatize a headline.

  Steps:
    1. Remove every character found in ``string.punctuation`` (ASCII
       punctuation only, so typographic quotes such as ‘ ’ are kept).
    2. Tokenize with ``word_tokenize`` and tag with ``nltk.pos_tag``.
    3. Lemmatize each token with its WordNet POS (noun when the tag has no
       WordNet equivalent) and lowercase it.

  Returns:
    ``(vocabularies, adjs)`` where ``vocabularies`` is the list of
    lowercased lemmas that are not English stop words, and ``adjs`` is the
    list of lowercased surface forms of every token tagged as an adjective
    (stop words included).
  """
  stripped = ''.join(ch for ch in text if ch not in string.punctuation)
  pos_tagged = nltk.pos_tag(word_tokenize(stripped))

  # Built once per call instead of once per token: constructing the
  # lemmatizer and re-reading the stop-word list inside the loop was
  # loop-invariant work, and a set gives O(1) membership tests.
  lemmatizer = WordNetLemmatizer()
  stop_words = set(stopwords.words('english'))

  vocabularies = []
  adjs = []
  for token, tag in pos_tagged:
    pos = get_wordnet_pos(tag) or wordnet.NOUN
    if pos == wordnet.ADJ:
      adjs.append(token.lower())
    lemma = lemmatizer.lemmatize(token, pos).lower()
    if lemma not in stop_words:
      vocabularies.append(lemma)
  return vocabularies,adjs

In [None]:
df['entities'] = df['original'].apply(get_entities)
df['vocabularies'] = df['original'].apply(process_text)
df[['vocabularies','adjs']] = df['vocabularies'].apply(pd.Series)
df.head()

Unnamed: 0,id,original,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,isis,"[(France, GPE), (Iraq, GPE)]","[france, ‘, hunt, citizen, join, ’, without, t...",[‘]
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,syria,"[(Pentagon, ORG), (2,000 %, PERCENT), (Russian...","[pentagon, claim, 2000, increase, russian, tro...",[russian]
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,coalition,"[(Iceland, GPE), (Furor Crashes, PERSON)]","[iceland, pm, calls, snap, vote, pedophile, fu...",[]
3,76,"In an apparent first , Iran and Israel each o...",slap,20000,0.4,engage,"[(first, ORDINAL), (Iran, GPE), (Israel, GPE)]","[apparent, first, iran, israel, militarily]","[apparent, first, other]"
4,6164,Trump was told weeks ago that Flynn misled Pr...,school,0,0.0,vice,"[(Trump, PERSON), (weeks ago, DATE), (Flynn, P...","[trump, tell, week, ago, flynn, mislead, presi...",[]


In [None]:
final_vocabularies = df[df['meanGrade'] < 1]['vocabularies']
final_vocabularies

0       [france, ‘, hunt, citizen, join, ’, without, t...
3             [apparent, first, iran, israel, militarily]
4       [trump, tell, week, ago, flynn, mislead, presi...
8              [canadians, may, pay, tax, americans, get]
9                 [dutch, minister, resigns, drug, baron]
                              ...                        
9646    [among, republicans, trump, popular, congressi...
9647    [state, official, blast, unprecedented, dhs, s...
9648    [protesters, rally, detained, jfk, airport, tr...
9649    [cruise, line, carnival, corp, join, fight, be...
9651                        [houseapproved, health, bill]
Name: vocabularies, Length: 4921, dtype: object

In [None]:
entities = df[['entities','adjs']]
entities[:10]

In [None]:
adjective = df[['adjs']]
adjective[:10]

In [None]:
def proprecess_ent(t_entities):
  """Collapse spaCy ``(text, label)`` entity pairs into a de-duplicated list.

  * ``GPE`` entities keep their own text, lowercased (e.g. ``'france'``).
  * The other handled labels are replaced by a generic word for the
    category (``PERSON`` -> ``'person'``, ``ORG`` -> ``'organization'``, ...).
  * Entities with any other label are ignored.

  The result is built from a set, so duplicates are removed and the order
  of the returned list is not meaningful.
  """
  generic_names = {
      'PERCENT': 'percent',
      'PERSON': 'person',
      'DATE': 'date',
      'ORG': 'organization',
      'PRODUCT': 'product',
      'LOC': 'location',
      'EVENT': 'event',
      'WORK_OF_ART': 'art',
      'LAW': 'law',
      'TIME': 'time',
      'MONEY': 'money',
  }
  collected = set()
  for ent in t_entities:
    kind = ent[1]
    if kind == 'GPE':
      collected.add(ent[0].lower())
    elif kind in generic_names:
      collected.add(generic_names[kind])
  return list(collected)

In [None]:
clusters = np.load('/content/drive/MyDrive/project_zzy/clusters.npy')

In [None]:
def cal_cos_sim(vector1, vector2):
  """Cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|).

  No guard is applied for zero-length vectors: if either norm is 0 the
  division is 0/0 and, for numpy inputs, the result is NaN.
  """
  dot_product = np.dot(vector1, vector2)
  norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
  return dot_product / norm_product

def get_mean_sim(M,clusters = clusters):
  """Mean cosine similarity between each cluster center and the rows of `M`.

  Args:
    M: 2-D array with one word vector per row. A 1-D input (the all-zeros
      placeholder that `vabs_2_vector` returns when no word has an
      embedding) is treated as "no vectors".
    clusters: 2-D array with one cluster center per row.

  Returns:
    1-D array of length ``clusters.shape[0]``; entry ``i`` is the average
    of ``cal_cos_sim(clusters[i], row)`` over all rows of `M`, or all
    zeros when `M` is 1-D.
  """
  n_clusters = clusters.shape[0]
  if len(M.shape) == 1:
    # Size the placeholder from the cluster matrix instead of a hard-coded
    # 7, so the feature width stays right if the cluster count changes.
    return np.zeros([n_clusters])
  sim4clusters = []
  for center in clusters:
    total = sum(cal_cos_sim(center, row) for row in M)
    sim4clusters.append(total / M.shape[0])
  return np.array(sim4clusters)

In [None]:
df['entities'] = df['entities'].apply(proprecess_ent)
df[:10]

Unnamed: 0,id,original,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,isis,"[france, iraq]","[france, ‘, hunt, citizen, join, ’, without, t...",[‘]
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,syria,"[percent, organization]","[pentagon, claim, 2000, increase, russian, tro...",[russian]
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,coalition,"[person, iceland]","[iceland, pm, calls, snap, vote, pedophile, fu...",[]
3,76,"In an apparent first , Iran and Israel each o...",slap,20000,0.4,engage,"[israel, iran]","[apparent, first, iran, israel, militarily]","[apparent, first, other]"
4,6164,Trump was told weeks ago that Flynn misled Pr...,school,0,0.0,vice,"[person, date]","[trump, tell, week, ago, flynn, mislead, presi...",[]
5,8832,"All 22 Trump made in his speech to Congress ,...",sounds,22200,1.2,promises,"[product, organization]","[22, trump, make, speech, congress, one, chart]",[]
6,12174,New DOJ alert system will flag against police,laughter,32100,1.2,crimes,[],"[new, doj, alert, system, flag, police]",[]
7,3731,As Someone Who Grew Up Among Fundamentalist I...,morons,21110,1.0,christians,"[person, us]","[someone, grew, among, fundamentalist, us, sur...",[surprised]
8,6554,"Canadians may pay more taxes than Americans , ...",loonies,10000,0.2,money,[],"[canadians, may, pay, tax, americans, get]",[more]
9,14191,Dutch minister resigns in drug baron,blow,0,0.0,row,[],"[dutch, minister, resigns, drug, baron]",[dutch]


In [None]:
df.to_csv("generate_humor_data.csv")

In [None]:
df.drop(['original'],axis=1,inplace=True)
df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
0,14530,twins,10000,0.2,isis,"[france, iraq]","[france, ‘, hunt, citizen, join, ’, without, t...",[‘]
1,13034,bowling,33110,1.6,syria,"[percent, organization]","[pentagon, claim, 2000, increase, russian, tro...",[russian]
2,8731,party,22100,1.0,coalition,"[person, iceland]","[iceland, pm, calls, snap, vote, pedophile, fu...",[]
3,76,slap,20000,0.4,engage,"[israel, iran]","[apparent, first, iran, israel, militarily]","[apparent, first, other]"
4,6164,school,0,0.0,vice,"[person, date]","[trump, tell, week, ago, flynn, mislead, presi...",[]


In [None]:
# def sentence_2_vector(vab, embeddings=embeddings,emb_size=300):
#     words=[w for w in vab if w.isalpha() and w in embeddings]
#     if len(words)==0:
#         return np.hstack([np.zeros(emb_size)])
#     M=np.array([embeddings[w] for w in words])
#     max,min = find_maxcos_and_mincos(M)
#     return np.append(M.mean(axis=0),np.array([max,min]))

def snetence_2_vector(vab, embeddings=None,emb_size=300):
    """Average the embeddings of the usable tokens in `vab`.

    Args:
        vab: iterable of string tokens.
        embeddings: mapping token -> vector. When omitted, the notebook's
            global ``embeddings`` is looked up at call time.
        emb_size: length of the zero vector returned when no token is usable.

    Returns:
        The element-wise mean of the vectors of all tokens that are purely
        alphabetic and present in `embeddings`, or ``np.zeros(emb_size)``
        when there is no such token.
    """
    if embeddings is None:
        # Resolved lazily: a default of `embeddings=embeddings` is evaluated
        # when the cell defining this function runs, which fails if the
        # GloVe table has not been loaded yet.
        embeddings = globals()['embeddings']
    vectors = [embeddings[w] for w in vab if w.isalpha() and w in embeddings]
    if not vectors:
        return np.zeros(emb_size)
    return np.array(vectors).mean(axis=0)

def vabs_2_vector(vab, embeddings=None,emb_size=300):
    """Stack the embeddings of the usable tokens in `vab` into a matrix.

    Args:
        vab: iterable of string tokens.
        embeddings: mapping token -> vector. When omitted, the notebook's
            global ``embeddings`` is looked up at call time.
        emb_size: length of the zero vector returned when no token is usable.

    Returns:
        A 2-D array with one row per token that is purely alphabetic and
        present in `embeddings` (input order preserved). When there is no
        such token, a 1-D ``np.zeros(emb_size)`` is returned instead, so the
        number of dimensions tells callers whether any vector was found.
    """
    if embeddings is None:
        # Resolved lazily: a default of `embeddings=embeddings` is evaluated
        # when the cell defining this function runs, which fails if the
        # GloVe table has not been loaded yet.
        embeddings = globals()['embeddings']
    vectors = [embeddings[w] for w in vab if w.isalpha() and w in embeddings]
    if not vectors:
        return np.zeros(emb_size)
    return np.array(vectors)

def word_2_vector(vab, embeddings=None,emb_size=300):
  """Look up the embedding of a single word.

  Args:
    vab: the word to look up.
    embeddings: mapping word -> vector. When omitted, the notebook's global
      ``embeddings`` is looked up at call time.
    emb_size: length of the zero vector returned for unknown words.

  Returns:
    ``np.array(embeddings[vab])`` when the word is known, otherwise
    ``np.zeros(emb_size)``.
  """
  if embeddings is None:
    # Resolved lazily: a default of `embeddings=embeddings` is evaluated
    # when the cell defining this function runs, which fails if the GloVe
    # table has not been loaded yet.
    embeddings = globals()['embeddings']
  if vab in embeddings:
    return np.array(embeddings[vab])
  return np.zeros(emb_size)

In [None]:
# Find which label (cluster) an edit-word vector is assigned to
def get_label(edit,clusters = clusters):
  """Return the index of the cluster center selected for `edit`.

  The center with the *smallest* cosine similarity to `edit` is chosen;
  on ties the lowest index wins. If every comparison is false (for
  example when the similarities are NaN), label 0 is returned.

  NOTE(review): selecting the least similar center looks unusual for a
  "which cluster does this belong to" lookup - confirm it is intended.
  """
  chosen = 0
  lowest_sim = cal_cos_sim(edit, clusters[0])
  for idx in range(1, clusters.shape[0]):
    sim = cal_cos_sim(edit, clusters[idx])
    if sim < lowest_sim:
      chosen, lowest_sim = idx, sim
  return chosen



In [None]:
globe_path = "/content/drive/My Drive/glove/glove.6B.300d.txt"
def load_word_embeddings(file=globe_path):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first field is the
    token, the remaining fields are parsed as a float32 numpy array.

    Args:
        file: path to the embeddings text file.

    Returns:
        dict mapping each token to its float32 vector.
    """
    embeddings={}
    # Explicit UTF-8 so tokens with non-ASCII characters load the same way
    # regardless of the platform's default encoding.
    with open(file,'r',encoding='utf-8') as infile:
        for line in infile:
            values=line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file).
                continue
            embeddings[values[0]]=np.asarray(values[1:],dtype='float32')
    return embeddings
embeddings = load_word_embeddings()

In [None]:
df['vocabularies'] = df['vocabularies'].apply(snetence_2_vector)
df['entities'] = df['entities'].apply(vabs_2_vector)
df['adjs'] = df['adjs'].apply(vabs_2_vector)
df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
0,14530,twins,10000,0.2,isis,"[[-0.055966, 0.33098, -0.48706, -0.63245, 0.07...","[-0.123039864, 0.08765901, 0.07984271, -0.2401...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,13034,bowling,33110,1.6,syria,"[[-0.34248, 0.22911, 0.49966, 0.6407, -0.32319...","[-0.030953001, 0.28873172, -0.036028426, 0.171...","[[0.029215, 0.10225, 0.14988, -0.20514, -0.470..."
2,8731,party,22100,1.0,coalition,"[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.14419276, -0.07962062, -0.07726025, 0.02730...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,76,slap,20000,0.4,engage,"[[0.21257, -0.51255, 0.097078, 0.18691, -0.177...","[0.298504, 0.120728, 0.1639076, -0.323734, 0.0...","[[0.46376, 0.2722, 0.39632, -0.47637, -0.43986..."
4,6164,school,0,0.0,vice,"[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.006150999, 0.06871863, -0.1344733, 0.213557...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
df['ori_word'] = df['ori_word'].apply(word_2_vector)
df['edit'] = df['edit'].apply(word_2_vector)
df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
0,14530,"[0.11861, 0.63912, -0.643, 0.27125, 0.041054, ...",10000,0.2,"[0.32328, -0.18626, -0.36277, -0.2135, 0.54738...","[[-0.055966, 0.33098, -0.48706, -0.63245, 0.07...","[-0.123039864, 0.08765901, 0.07984271, -0.2401...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,13034,"[0.21417, -0.12899, 0.54156, 0.013344, 0.35649...",33110,1.6,"[0.49246, -0.40682, 0.25794, -0.27576, 0.17599...","[[-0.34248, 0.22911, 0.49966, 0.6407, -0.32319...","[-0.030953001, 0.28873172, -0.036028426, 0.171...","[[0.029215, 0.10225, 0.14988, -0.20514, -0.470..."
2,8731,"[-0.073121, -0.2013, 0.31239, -0.22547, -0.147...",22100,1.0,"[-0.20334, 0.020148, 0.69492, 0.0074181, -0.00...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.14419276, -0.07962062, -0.07726025, 0.02730...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,76,"[0.20881, 0.13581, -0.34811, 0.10243, -0.44111...",20000,0.4,"[0.070169, 0.40179, -0.11105, -0.61456, 0.2524...","[[0.21257, -0.51255, 0.097078, 0.18691, -0.177...","[0.298504, 0.120728, 0.1639076, -0.323734, 0.0...","[[0.46376, 0.2722, 0.39632, -0.47637, -0.43986..."
4,6164,"[-0.64161, 0.06824, 0.051487, 0.064869, 0.2172...",0,0.0,"[0.43693, 0.085203, 0.010649, -0.24189, 0.0404...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.006150999, 0.06871863, -0.1344733, 0.213557...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
df['entities_sim'] = df['entities'].apply(get_mean_sim)
df['adjs_sim'] = df['adjs'].apply(get_mean_sim)
df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs,entities_sim,adjs_sim
0,14530,"[0.11861, 0.63912, -0.643, 0.27125, 0.041054, ...",10000,0.2,"[0.32328, -0.18626, -0.36277, -0.2135, 0.54738...","[[-0.055966, 0.33098, -0.48706, -0.63245, 0.07...","[-0.123039864, 0.08765901, 0.07984271, -0.2401...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.4557642311825252, -0.3271379698522151, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,13034,"[0.21417, -0.12899, 0.54156, 0.013344, 0.35649...",33110,1.6,"[0.49246, -0.40682, 0.25794, -0.27576, 0.17599...","[[-0.34248, 0.22911, 0.49966, 0.6407, -0.32319...","[-0.030953001, 0.28873172, -0.036028426, 0.171...","[[0.029215, 0.10225, 0.14988, -0.20514, -0.470...","[0.47334306052214403, -0.3555459367938077, -0....","[0.40724693969357967, -0.2807789847763731, -0...."
2,8731,"[-0.073121, -0.2013, 0.31239, -0.22547, -0.147...",22100,1.0,"[-0.20334, 0.020148, 0.69492, 0.0074181, -0.00...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.14419276, -0.07962062, -0.07726025, 0.02730...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.33369920410868104, -0.10853172872440946, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,76,"[0.20881, 0.13581, -0.34811, 0.10243, -0.44111...",20000,0.4,"[0.070169, 0.40179, -0.11105, -0.61456, 0.2524...","[[0.21257, -0.51255, 0.097078, 0.18691, -0.177...","[0.298504, 0.120728, 0.1639076, -0.323734, 0.0...","[[0.46376, 0.2722, 0.39632, -0.47637, -0.43986...","[0.4353441928085451, -0.2997730771442755, -0.1...","[0.5772985393982327, -0.27067907002881947, -0...."
4,6164,"[-0.64161, 0.06824, 0.051487, 0.064869, 0.2172...",0,0.0,"[0.43693, 0.085203, 0.010649, -0.24189, 0.0404...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.006150999, 0.06871863, -0.1344733, 0.213557...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.5044359084718961, -0.21070395103529221, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [None]:
df['label'] = df['edit'].apply(get_label)
df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs,entities_sim,adjs_sim,label
0,14530,"[0.11861, 0.63912, -0.643, 0.27125, 0.041054, ...",10000,0.2,"[0.32328, -0.18626, -0.36277, -0.2135, 0.54738...","[[-0.055966, 0.33098, -0.48706, -0.63245, 0.07...","[-0.123039864, 0.08765901, 0.07984271, -0.2401...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.4557642311825252, -0.3271379698522151, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1
1,13034,"[0.21417, -0.12899, 0.54156, 0.013344, 0.35649...",33110,1.6,"[0.49246, -0.40682, 0.25794, -0.27576, 0.17599...","[[-0.34248, 0.22911, 0.49966, 0.6407, -0.32319...","[-0.030953001, 0.28873172, -0.036028426, 0.171...","[[0.029215, 0.10225, 0.14988, -0.20514, -0.470...","[0.47334306052214403, -0.3555459367938077, -0....","[0.40724693969357967, -0.2807789847763731, -0....",1
2,8731,"[-0.073121, -0.2013, 0.31239, -0.22547, -0.147...",22100,1.0,"[-0.20334, 0.020148, 0.69492, 0.0074181, -0.00...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.14419276, -0.07962062, -0.07726025, 0.02730...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.33369920410868104, -0.10853172872440946, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1
3,76,"[0.20881, 0.13581, -0.34811, 0.10243, -0.44111...",20000,0.4,"[0.070169, 0.40179, -0.11105, -0.61456, 0.2524...","[[0.21257, -0.51255, 0.097078, 0.18691, -0.177...","[0.298504, 0.120728, 0.1639076, -0.323734, 0.0...","[[0.46376, 0.2722, 0.39632, -0.47637, -0.43986...","[0.4353441928085451, -0.2997730771442755, -0.1...","[0.5772985393982327, -0.27067907002881947, -0....",5
4,6164,"[-0.64161, 0.06824, 0.051487, 0.064869, 0.2172...",0,0.0,"[0.43693, 0.085203, 0.010649, -0.24189, 0.0404...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[0.006150999, 0.06871863, -0.1344733, 0.213557...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.5044359084718961, -0.21070395103529221, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1


In [None]:
# Headlines graded above 1.5 are the "funny" examples used for training.
funny_df = df[df['meanGrade'] > 1.5]
# "Not funny" headlines are those graded below 1 - the same filter that
# builds `final_vocabularies`, which is later paired row by row with the
# words generated for this frame. (The previous `> 1` selected the
# opposite side of the scale and overlapped with `funny_df`.)
Not_funny_df = df[df['meanGrade'] < 1]

In [None]:
Not_funny_df_vab = Not_funny_df[['vocabularies']][:100]
Not_funny_df_vab.head()

Unnamed: 0,vocabularies
1,"[-0.030953001, 0.28873172, -0.036028426, 0.171..."
5,"[-0.058314834, 0.007235167, -0.10722498, 0.191..."
6,"[-0.1539685, -0.19218601, -0.028522665, -0.080..."
13,"[-0.14930867, 0.17453599, 0.03545333, -0.01981..."
17,"[-0.008785009, 0.01604, -0.19752249, -0.173754..."


In [None]:
def getNpdata(df):
  """Assemble the per-row feature matrix from the vector-valued columns.

  For each of ``ori_word``, ``vocabularies``, ``entities_sim`` and
  ``adjs_sim`` the per-row arrays are stacked into a 2-D array; the four
  arrays are then joined side by side in that order.

  Returns:
    2-D array with one row per row of `df`.
  """
  feature_columns = ('ori_word', 'vocabularies', 'entities_sim', 'adjs_sim')
  parts = []
  for column in feature_columns:
    cells = df[column].values.tolist()
    parts.append(np.array([cell.tolist() for cell in cells]))
  return np.concatenate(parts, axis=1)

In [None]:
funny_data = getNpdata(funny_df)
not_funny_data = getNpdata(Not_funny_df)

In [None]:
data = df['ori_word'].values.tolist()
d1 = np.array([x.tolist() for x in data])
data = df['vocabularies'].values.tolist()
d2 = np.array([x.tolist() for x in data])
data = df['entities_sim'].values.tolist()
d3 = np.array([x.tolist() for x in data])
data = df['adjs_sim'].values.tolist()
d4 = np.array([x.tolist() for x in data])

In [None]:
all_data_1 = np.append(d1,d2,axis=1)
all_data_2 = np.append(d3,d4,axis=1)
all_data = np.append(all_data_1,all_data_2,axis=1)
all_data.shape

(1708, 614)

In [None]:
label = df['label'].values.tolist()
label[0]

1

In [None]:
label = funny_df['label'].values.tolist()
label_np = np.array(label)

In [None]:
# Column vector with one label per row; -1 lets numpy infer the row count
# instead of hard-coding the size of the current funny subset.
label_np = label_np.reshape(-1,1)

In [None]:
all_data_with_lable = np.append(funny_data, label_np ,axis=1)
all_data_with_lable.shape

(1708, 615)

In [None]:
np.save("ALL_trian_data.npy",all_data_with_lable)

In [None]:
def reshape_300(vec):
  """Reshape every element of `vec` to shape (1, 300), in place.

  Args:
    vec: Series whose labels are 0..n-1 and whose elements are arrays
      holding 300 values each.

  Returns:
    The same `vec` object, after each element has been replaced by its
    ``reshape(1, 300)`` view.
  """
  # Operate on the argument: the body previously read an unrelated global
  # named `data` and ignored `vec` entirely.
  data = vec
  for i in range(data.shape[0]):
    data[i] = data[i].reshape(1,300)
  return data

In [None]:
data_ori = reshape_300(df['ori_word'])
data_vab = reshape_300(df['vocabularies'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
def reshape_7(vec):
  """Reshape every element of `vec` to shape (1, 7), in place.

  `vec` is expected to be a Series labelled 0..n-1 whose elements are
  arrays of 7 values. The Series itself is modified and returned.
  """
  target = vec
  n_rows = target.shape[0]
  for position in range(n_rows):
    row_vector = target[position].reshape(1, 7)
    target[position] = row_vector
  return target

In [None]:
data_ori = reshape_7(df['adjs_sim'])
data_vab = reshape_7(df['entities_sim'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
df.to_csv("generate_humor_train_data.csv")

In [None]:
train_df = df[df['meanGrade'] > 1.5]
train_df.head()

Unnamed: 0,id,edit,grades,meanGrade,ori_word,entities,vocabularies,adjs
1,13034,"[0.21417, -0.12899, 0.54156, 0.013344, 0.35649...",33110,1.6,"[0.49246, -0.40682, 0.25794, -0.27576, 0.17599...","[[-0.045708, -0.0040809, -0.17185, 0.15738, -0...","[-0.030953001, 0.28873172, -0.036028426, 0.171...","[[0.029215, 0.10225, 0.14988, -0.20514, -0.470..."
17,2357,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",22211,1.6,"[-0.19841, 0.35988, 0.37213, -0.16197, -0.6032...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.008785009, 0.01604, -0.19752249, -0.173754...","[[-0.22351, -0.010812, -0.11714, 0.091744, 0.2..."
20,8552,"[-0.30531, -0.20623, 0.13682, -0.22309, -0.048...",22211,1.6,"[-0.12675, 0.097588, 0.024441, 0.58839, 0.1570...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[-0.008440878, -0.13703513, -0.274488, 0.05146...","[[0.075965, -0.293, -0.31477, 0.1302, -0.1299,..."
47,11614,"[0.062218, -0.042259, 0.030933, 0.13953, -0.58...",33200,1.6,"[0.15883, -0.087638, 0.092703, -0.42416, -0.21...","[[-0.55598, 0.027967, -0.32289, -0.075042, -0....","[-0.185929, -0.20650749, 0.129554, -0.167867, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
52,13131,"[-0.36122, 0.31083, 0.16319, 0.217, -0.18337, ...",33111,1.8,"[0.46825, -0.55791, -0.21694, 0.36852, 0.30092...","[[0.13399, 0.95343, -0.021405, -0.44056, -0.31...","[0.1618298, 0.053095, 0.016567597, 0.10044254,...","[[0.080575, -0.15005, 0.014668, 0.49301, 0.182..."


In [None]:
humor_dic_df = pd.read_csv("/content/humor_dataset.csv")
humor_dic_df.head()

Unnamed: 0,word,mean,sd,n,mean_M,sd_M,n_M,mean_F,sd_F,n_F,mean_young,sd_young,n_young,mean_old,sd_old,n_old
0,abbey,2.292683,1.145511,41,2.176471,1.380004,17,2.347826,0.982052,23,2.391304,1.196173,23,2.166667,1.098127,18
1,abode,2.413793,1.118585,29,2.1,0.994429,10,2.578947,1.169795,19,2.692308,1.182132,13,2.1875,1.046821,16
2,abscess,1.59375,1.042929,32,1.625,1.187735,8,1.583333,1.017955,24,1.555556,1.041618,18,1.642857,1.081818,14
3,absence,1.64,0.95219,25,1.615385,0.960769,13,1.666667,0.984732,12,1.571429,0.851631,14,1.727273,1.103713,11
4,abstract,2.411765,1.281988,34,1.933333,1.032796,15,2.789474,1.357242,19,2.421053,1.121298,19,2.4,1.502379,15


In [None]:
humor_dic_df['vector'] = humor_dic_df['word'].apply(lambda x: embeddings.get(x))
humor_dic_df.drop(['sd','n','mean_M','sd_M','n_M','mean_F','sd_F','n_F','mean_young','sd_young','n_young','mean_old',\
'sd_old','n_old'],axis=1,inplace=True)
humor_dic_df.head()

Unnamed: 0,word,mean,vector
0,abbey,2.292683,"[0.15705, 0.18207, -0.27124, 0.15846, 0.482, -..."
1,abode,2.413793,"[0.14438, -0.62815, 0.059843, 0.29825, 0.19338..."
2,abscess,1.59375,"[-0.60272, -0.13241, -0.63239, -0.35504, 0.715..."
3,absence,1.64,"[0.035415, -0.063948, 0.0024058, -0.34112, 0.1..."
4,abstract,2.411765,"[-0.30829, 0.10864, -0.094104, 0.010129, 0.012..."


In [None]:
# Words without a GloVe vector (embeddings.get returned None) cannot be
# clustered. Drop them by testing the column itself rather than by
# hard-coded row numbers, so the cell is safe to re-run and still works if
# the dictionary file or the embedding table changes.
missing_vector = humor_dic_df['vector'].apply(lambda v: v is None)
humor_dic_df.drop(index=humor_dic_df.index[missing_vector],inplace=True)

# Matrix of word vectors (one row per remaining word) consumed by the
# KMeans cell below.
traindata = np.array([v.tolist() for v in humor_dic_df['vector']])

In [None]:
from sklearn.cluster import KMeans
# Fixed seed: cluster ids are arbitrary and change between runs otherwise,
# which would silently invalidate the saved cluster centers and any labels
# derived from them.
clf = KMeans(n_clusters=7, random_state=27)
s = clf.fit(traindata)

# Cluster id assigned to each word
humor_dic_df['label'] = clf.labels_


In [None]:
clusters = clf.cluster_centers_

In [None]:
np.save("clusters.npy",clusters)

In [None]:
humor_dic_df.to_csv('humor_dic.csv')

In [None]:
all_data_with_lable = np.load('/content/drive/MyDrive/project_zzy/ALL_trian_data.npy')

In [None]:
X_train = all_data_with_lable[:,:-1]
y_train = all_data_with_lable[:,-1]

In [None]:
y_train.shape

(1708,)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.1)

In [None]:
y_train

array([0., 1., 0., ..., 1., 5., 0.])

In [None]:
import xgboost as xgb

In [None]:
# Booster parameters for a multi-class softmax model; 'num_class' is 7,
# the same number of clusters KMeans is fitted with in this notebook.
# NOTE(review): 'num_boost_round' is normally an argument of xgb.train (400
# is passed positionally below), not a booster parameter - presumably it
# has no effect here; confirm and remove.
# NOTE(review): 'silent' may be deprecated in the installed xgboost version
# (newer releases use 'verbosity') - confirm.
xgb_pars = {'learning_rate': 0.2,
          'max_depth': 5,
          'num_boost_round':20,
          'objective': 'multi:softmax',
          'random_state': 27,
          'silent':0,
          'num_class':7}

# d_val and d_test are both built from X_test; d_val carries the labels so
# it can be used for evaluation, d_test does not. d_test is not used in
# the cells visible here.
d_train = xgb.DMatrix(X_train, label=y_train)
d_val = xgb.DMatrix(X_test, label=y_test)
d_test = xgb.DMatrix(X_test)
watchlist = [(d_train, 'train'), (d_val, 'valid')]

# Up to 400 rounds, stopping once the metric on the last watchlist entry
# ('valid') has not improved for 50 rounds; progress is printed every 50.
bst = xgb.train(xgb_pars, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-merror:0.363696	valid-merror:0.532164
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 50 rounds.
[50]	train-merror:0.076773	valid-merror:0.526316
Stopping. Best iteration:
[3]	train-merror:0.299284	valid-merror:0.450292



In [None]:
ans1 = bst.predict(xgb.DMatrix(not_funny_data))
ans1

array([1., 0., 0., ..., 0., 1., 0.], dtype=float32)

In [None]:
humor_dic = pd.read_csv("/content/drive/MyDrive/project_zzy/humor_dic.csv")
humor_dic.head()

Unnamed: 0.1,Unnamed: 0,word,mean,vector,label
0,0,abbey,2.292683,[ 0.15705 0.18207 -0.27124 0.15846 ...,5
1,1,abode,2.413793,[ 1.4438e-01 -6.2815e-01 5.9843e-02 2.9825e-...,1
2,2,abscess,1.59375,[-6.0272e-01 -1.3241e-01 -6.3239e-01 -3.5504e-...,4
3,3,absence,1.64,[ 3.5415e-02 -6.3948e-02 2.4058e-03 -3.4112e-...,0
4,4,abstract,2.411765,[-3.0829e-01 1.0864e-01 -9.4104e-02 1.0129e-...,6


In [None]:
# Column assignment aligns on the index. `humor_dic_df` keeps its original
# row numbers (with gaps where rows were dropped) while the CSV was read
# back with a fresh 0..n-1 index, so align on the index column that to_csv
# saved ('Unnamed: 0') instead of on the new positions.
humor_dic_df['label'] = humor_dic.set_index('Unnamed: 0')['label']

In [None]:
def findMostSimWord(label,index,df = df,dic = humor_dic_df):
  """Pick the dictionary word of cluster `label` closest to an original word.

  Args:
    label: cluster id; only rows of `dic` with this ``label`` are considered.
    index: index label of the row in `df` whose ``ori_word`` vector is used.
    df: frame with an ``ori_word`` column holding word vectors.
    dic: frame with ``word``, ``vector`` and ``label`` columns.

  Returns:
    The ``word`` with the highest cosine similarity to the original word
    (first one on ties), or ``""`` when no candidate compares greater than
    the initial threshold (no row matches, or every similarity is NaN).
  """
  vec = df['ori_word'][index]
  best_sim = -99999999
  best_word = ""
  # `_` instead of `index`: the loop variable used to overwrite the
  # `index` parameter.
  for _, row in dic.iterrows():
    # Skip rows from other clusters and rows without a vector. Previously
    # the comparison ran even when no similarity had been computed for the
    # row, which raised if the first matching row had no vector.
    if row.label != label or row.vector is None:
      continue
    sim = cal_cos_sim(vec,row.vector)
    if sim > best_sim:
      best_sim = sim
      best_word = row.word
  return best_word

In [None]:
def findMostNotSimWord(label,df = Not_funny_df,dic = humor_dic_df):
  """For each headline, pick the least similar dictionary word of its cluster.

  Args:
    label: either a sequence with one predicted cluster id per row of `df`
      (in row order, e.g. the output of the classifier on these rows), or a
      single cluster id applied to every row.
    df: frame with an ``ori_word`` column holding word vectors.
    dic: frame with ``word``, ``vector`` and ``label`` columns.

  Returns:
    List of words, one per processed row of `df`, capped at the first 100
    rows. An entry is ``""`` when no candidate compares lower than the
    initial threshold (no row of the cluster, or every similarity is NaN).
  """
  # Previously the `label` argument was overwritten from a global
  # prediction array that was indexed with the DataFrame's index labels;
  # on a filtered frame those labels are not row positions, so headlines
  # were paired with the wrong prediction.
  per_row_labels = np.ndim(label) > 0
  edit = []
  for position, (_, headline) in enumerate(df.iterrows()):
    vec = headline['ori_word']
    wanted = int(label[position]) if per_row_labels else int(label)
    lowest_sim = 99999999
    word = ""
    for _, entry in dic.iterrows():
      # Only compare rows for which a similarity can actually be computed;
      # a value left over from an earlier row must never be reused.
      if entry.label != wanted or entry.vector is None:
        continue
      sim = cal_cos_sim(vec,entry.vector)
      if sim < lowest_sim:
        lowest_sim = sim
        word = entry.word
    edit.append(word)
    if len(edit) == 100:
      return edit
  return edit

In [None]:
edit_word = findMostNotSimWord(ans1,df = Not_funny_df,dic = humor_dic_df)

  


In [None]:
edit_word

In [None]:
a = findMostSimWord(1,46)
a

'army'

In [None]:
ff = final_vocabularies.tolist()

In [None]:
for i in range(len(edit_word)):
  if(edit_word[i] != ''):
    ff[i].append(edit_word[i])

In [None]:
c = {'vab':ff}
data=pd.DataFrame(c)

In [None]:
data.head()

Unnamed: 0,vab
0,"[france, ‘, hunt, citizen, join, ’, without, t..."
1,"[apparent, first, iran, israel, militarily, pa..."
2,"[trump, tell, week, ago, flynn, mislead, presi..."
3,"[canadians, may, pay, tax, americans, get, mou..."
4,"[dutch, minister, resigns, drug, baron, waddle]"


In [None]:
data = data[:100]

In [None]:
data.to_csv('testForhumor.csv')