In [46]:
import os
from opencc import OpenCC
from pycorenlp import StanfordCoreNLP
from collections import Counter
import random
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec
import nltk
from nltk.collocations import *
import time
import jieba
# Make jieba build its prefix dictionary from this custom dictionary file
# instead of the dictionary bundled with the package.
jieba.set_dictionary('/Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj/others_work/dict.txt.big1.txt')

In [2]:
# OpenCC converters: 't2s' turns Traditional Chinese into Simplified Chinese,
# 's2t' converts back. Text is simplified before being sent to CoreNLP and the
# generated titles are converted back to Traditional Chinese.
t2s = OpenCC('t2s')
s2t = OpenCC('s2t')
# Client for a Stanford CoreNLP server addressed at localhost:9000.
# NOTE(review): presumably the server has to be started separately before
# extract_script() is called - confirm.
nlp = StanfordCoreNLP('http://localhost:9000')
# Word2vec model loaded from disk.
# NOTE(review): no live code in the visible notebook queries this model -
# confirm the (potentially slow) load is still needed.
nlpw2v = word2vec.Word2Vec.load('/Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj/others_work/word2vec/w2v.bin')

In [3]:
def is_time_stamp(l):
    """Return True when `l` starts with two numeric characters followed by ':'.

    This is the shape of a subtitle timing line such as
    '00:01:02,000 --> 00:01:04,000'.

    Args:
        l: the line to test (a str, without or with trailing content).

    Returns:
        bool: True for 'NN:...' lines, False otherwise (including strings
        shorter than three characters).
    """
    # Slice (l[2:3]) instead of index (l[2]) so that a short input such as
    # '12' yields False instead of raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'

def has_no_text(line):
    """Tell whether a subtitle-file line carries no dialogue text.

    A line counts as text-free when, after removing newline characters from
    both ends, it is empty, purely numeric (presumably the subtitle sequence
    number), a time stamp according to is_time_stamp(), or completely wrapped
    in parentheses.

    Args:
        line: one raw line read from the subtitle file.

    Returns:
        bool: True if the line should be dropped, False if it holds text.
    """
    # Remove the line separator before inspecting the content.
    content = line.strip('\n')
    if len(content) == 0:
        return True
    return (
        content.isnumeric()
        or is_time_stamp(content)
        or (content[0] == '(' and content[-1] == ')')
    )

def is_lowercase_letter_or_comma(letter):
    """Return True for a comma or for alphabetic text that has no upper-case form in it.

    Alphabetic characters without case (for example CJK characters) also
    qualify, because lower-casing them leaves them unchanged.

    Args:
        letter: the string to test (normally a single character).

    Returns:
        bool: True if `letter` is ',' or is alphabetic and equal to its own
        lower-cased form; False otherwise.
    """
    unchanged_by_lowering = letter.isalpha() and letter.lower() == letter
    return unchanged_by_lowering or letter == ','

def clean_up(lines):
    """Return the lines that carry text, in their original order.

    Args:
        lines: any iterable of raw subtitle-file lines (a list from
            readlines(), a generator, ...). The input is not modified.

    Returns:
        list: the lines for which has_no_text() is False.
    """
    # A comprehension never mutates its input, so no defensive copy is needed;
    # this also lets callers pass iterators that cannot be sliced.
    return [line for line in lines if not has_no_text(line)]

In [35]:
# Load the stop-word list, one entry per line.
# The encoding is stated explicitly: the file holds CJK punctuation, and
# relying on the platform default would break on systems whose locale
# encoding is not UTF-8.
with open('/Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj/stopword.txt','r', encoding='utf-8') as f:
    stopword = f.read()
stopword = stopword.split("\n")
# Preview the first entries as the cell output.
stopword[:10]

[',', '?', '、', '。', '“', '”', '《', '》', '！', '，']

In [5]:
def filter_stopword(data):
    """Drop every item of `data` that appears in the module-level `stopword` list.

    Args:
        data: an iterable of tokens.

    Returns:
        list: the tokens not found in `stopword`, in their original order.
    """
    return [token for token in data if token not in stopword]

def time_transform(s):
    """Map the day words '今天' / '明天' to '今日' / '明日'.

    Args:
        s: the word to normalise.

    Returns:
        The replacement word when `s` is one of the two handled words,
        otherwise `s` unchanged.
    """
    replacements = (('今天', '今日'), ('明天', '明日'))
    for colloquial, replacement in replacements:
        if s == colloquial:
            return replacement
    return s

In [7]:
def read_data(scriptfile,root='/Users/Wan-Ting/Downloads/input'):
    """Read one subtitle file and return its text lines with markup characters deleted.

    The file is decoded as UTF-8 (undecodable bytes are replaced), lines
    without text are dropped via clean_up(), and every character listed in
    the deletion set below is removed from the remaining lines.

    Args:
        scriptfile: file name of the subtitle file, relative to `root`.
        root: directory that contains the subtitle files.

    Returns:
        list of str: one cleaned string per text-bearing line (newline
        characters are part of the deletion set, so they are removed too).
    """
    # NOTE: this is a set of single characters to delete, not a list of
    # substrings. As a consequence all ASCII lower-case letters, the digits
    # 0/1/2/4/8/9 and the capitals I/B/H/F are removed from every line.
    # Backslashes are now escaped explicitly. The former literal spelled
    # `{\an8}\{` unescaped, which relied on the invalid escape sequence `\{`
    # and silently turned `\a` into the BEL control character. BEL is kept in
    # the set (final '\a') so exactly the same characters are deleted as before.
    strip_chars = (
        '""‘“『』「」[]...*◎#＃\n— \t<i></i><I></I><B></B>'
        '{\\an8}\\{1c H0080FF}wwwgamedsc{i1}i{pos 190 200.104}﹒∮—-'
        'abcdefghijklmnopqrstuvwxyz'
        '\a'
    )
    # Build the translation table once instead of once per line.
    strip_table = str.maketrans('', '', strip_chars)
    with open(os.path.join(root, scriptfile), encoding='utf-8', errors='replace') as f:
        text_lines = clean_up(f.readlines())
    return [line.translate(strip_table) for line in text_lines]

In [9]:
def jieba_seg(scriptfile, root='/Users/Wan-Ting/Downloads/input'):
    """Segment a subtitle file with jieba and return its non-stop-word tokens.

    Args:
        scriptfile: file name of the subtitle file, relative to `root`.
        root: directory that contains the subtitle files. Defaults to the
            folder that used to be hard-coded here, so existing calls behave
            as before.

    Returns:
        list of str: the tokens of all lines, in reading order, with every
        token found in the module-level `stopword` list removed.
    """
    lines = read_data(scriptfile, root=root)
    # A set gives constant-time membership tests; the stop-word list is
    # checked once per token.
    stop_set = set(stopword)
    tokens = []
    for line in lines:
        # cut_all=False selects jieba's accurate (non-exhaustive) mode.
        tokens.extend(
            word for word in jieba.cut(line, cut_all=False)
            if word not in stop_set
        )
    return tokens

In [10]:
def extract_script(movie_lines):
    """Annotate subtitle lines with CoreNLP and collect (lemma, pos, ner) tuples.

    Each line is converted to Simplified Chinese and sent to the CoreNLP
    client `nlp` with the tokenize, ssplit, pos and ner annotators. The
    elapsed time is printed when all lines are processed.

    Args:
        movie_lines: iterable of cleaned subtitle lines.

    Returns:
        list: one entry per successfully parsed line; each entry is a list of
        (lemma, pos, ner) tuples covering all tokens of all sentences of that
        line. Lines whose annotation result lacks the expected structure are
        left out.
    """
    tstart = time.time()
    information = []
    for line in movie_lines:
        annotation = nlp.annotate(t2s.convert(line), properties={
            'annotators': 'tokenize,ssplit,pos,ner',
            'outputFormat': 'json'
        })
        try:
            tuple_list = [
                (token['lemma'], token['pos'], token['ner'])
                for sentence in annotation['sentences']
                for token in sentence['tokens']
            ]
        except (KeyError, TypeError):
            # The result is not the expected nested dict (for example the
            # client handed back something other than parsed JSON): skip
            # this line. Only these two errors are swallowed, so unrelated
            # failures and Ctrl-C are no longer hidden.
            continue
        information.append(tuple_list)
    print('PARSING TIME :%.2f secs' %(time.time() - tstart))
    return information

In [11]:
def change_similar(term,nlpw2v):
    """Return the word most similar to `term` according to a word2vec model.

    Args:
        term: the word to look up; it is converted to Traditional Chinese
            with `s2t` before the query.
        nlpw2v: a gensim Word2Vec model or a KeyedVectors object.

    Returns:
        str: the top-1 neighbour of `term`, or `term` itself when the lookup
        fails (for example because the word is not in the vocabulary).
    """
    # Similarity queries live on the model's `wv` (KeyedVectors) attribute;
    # gensim 4 removed the old model-level `most_similar`. Falling back to the
    # object itself keeps plain KeyedVectors working.
    vectors = getattr(nlpw2v, 'wv', nlpw2v)
    try:
        return vectors.most_similar(s2t.convert(term), topn=1)[0][0]
    except Exception:
        # Best-effort lookup: keep the original term on any lookup error
        # (typically KeyError for out-of-vocabulary words) while letting
        # KeyboardInterrupt / SystemExit through.
        return term

In [12]:
def title_collocation(data):
    """Build a title from the strongest bigram collocation in `data`.

    Bigrams that occur fewer than 3 times are ignored. The 10 best bigrams
    by likelihood ratio are inspected in rank order and the first one whose
    two words differ is joined and converted to Traditional Chinese.

    Args:
        data: sequence of tokens (e.g. the output of jieba_seg()).

    Returns:
        str: the title, or the sentinel 'NO MATCH FOUND' when no bigram
        qualifies. This includes the case where every top bigram is a
        repeated word, which previously fell through and returned the last
        rejected bigram.
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(data)
    finder.apply_freq_filter(3)
    # Score the bigrams once; the ranking does not change between iterations.
    candidates = finder.nbest(bigram_measures.likelihood_ratio, 10)
    for first, second in candidates:
        if first != second:
            return s2t.convert(first + second)
    return 'NO MATCH FOUND'

In [13]:
def title_frequency(information):
    """Build a two-word title from the most frequent content words.

    Candidate tokens are (lemma, pos, ner) tuples that are not stop words,
    are not tagged PERSON, have POS NN, VV or NT, and whose lemma has at
    least two characters. Candidates are ranked by frequency, then:

    * if one of the top five candidates is tagged DATE, the title is that
      date word (normalised by time_transform) followed by the most frequent
      non-DATE candidate;
    * otherwise, if the top candidate is a noun (NN), the title is the second
      candidate followed by the top one;
    * otherwise the title is the top candidate followed by the most frequent
      noun.

    When the word needed for one of the two slots does not exist (fewer
    candidates than the rule expects), that slot is left empty instead of
    raising IndexError.

    Args:
        information: list of per-line lists of (lemma, pos, ner) tuples, as
            returned by extract_script().

    Returns:
        str: the title converted to Traditional Chinese.

    Raises:
        ValueError: if no token qualifies as a candidate at all.
    """
    # A set gives constant-time stop-word checks.
    stop_set = set(stopword)
    candidate_counts = Counter(
        t
        for sent in information
        for t in sent
        if t[0] not in stop_set
        and t[2] != 'PERSON'
        and t[1] in ('NN', 'VV', 'NT')
        and len(t[0]) >= 2
    ).most_common()
    if not candidate_counts:
        raise ValueError('no title candidates found')
    # Candidates ordered from most to least frequent; counts are no longer needed.
    ranked = [token for token, _count in candidate_counts]

    # Slicing copes with fewer than five candidates.
    top_ner = [token[2] for token in ranked[:5]]
    if 'DATE' in top_ner:
        a1 = time_transform(ranked[top_ner.index('DATE')][0])
        a2 = next((token[0] for token in ranked if token[2] != 'DATE'), '')
    elif ranked[0][1] == 'NN':
        a1 = ranked[1][0] if len(ranked) > 1 else ''
        a2 = ranked[0][0]
    else:
        a1 = ranked[0][0]
        a2 = next((token[0] for token in ranked if token[1] == 'NN'), '')
    return s2t.convert(a1 + a2)

In [14]:
def title_generator(scriptfile):
    """Generate a title for one subtitle file.

    The collocation method is tried first; when it reports
    'NO MATCH FOUND', the file is parsed with CoreNLP and the
    frequency method is used instead. The method used is printed.

    Args:
        scriptfile: file name of the subtitle file inside the input folder.

    Returns:
        str: the generated title.
    """
    collocation_title = title_collocation(jieba_seg(scriptfile))
    if collocation_title != 'NO MATCH FOUND':
        print('method1:collocation')
        return collocation_title

    # Fallback: frequency-based title built from the CoreNLP annotations.
    movie_lines = read_data(scriptfile,root='/Users/Wan-Ting/Downloads/input')
    parsed = extract_script(movie_lines)
    print('method2:frequency')
    return title_frequency(parsed)

# mvls: the subtitle (.srt) files for which movie titles will be generated

In [15]:
# Display the kernel's current working directory (informational only; the
# file paths used in the visible cells are absolute).
os.getcwd()

'/Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj'

In [27]:
# Subtitle files to process: '1.srt' through '10.srt'.
mvls = ['%d.srt' % number for number in range(1, 11)]

In [47]:
# Write one "<file id>\t<title>" line per subtitle file in mvls.
# To process every file in the input folder instead, build mvls with
# os.listdir('/Users/Wan-Ting/Downloads/input').
# The `with` block closes the file even if something unexpected happens, and
# the explicit encoding keeps the Chinese titles independent of the locale.
with open('/Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj/task1_group5.txt', 'w', encoding='utf-8') as create:
    for scriptfile in mvls:
        try:
            title = title_generator(scriptfile)
            create.write(scriptfile.split('.srt')[0]+'\t'+title+'\n')
        except Exception as exc:
            # Best effort: report the failing file together with the reason
            # and carry on with the next one.
            print('%s\t%r' % (scriptfile, exc))
print('done')

Building prefix dict from /Users/Wan-Ting/Google Drive/NCTU/NLP/finalproj/others_work/dict.txt.big1.txt ...
Dumping model to file cache /var/folders/q2/_z14v11n4hl9pw308x7_qs_80000gn/T/jieba.ud9d39d53952634e7a1e9fd07ad823249.cache
Loading model cost 3.523 seconds.
Prefix dict has been built succesfully.


method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
method1:collocation
done
