In [1]:
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np
import re

tokenizer = Tokenizer()

In [2]:
def vowels_of(kana):
    table = {
        '„Ç¢':'a','„Ç§':'i','„Ç¶':'u','„Ç®':'e','„Ç™':'o',
        '„É£':'a','„É•':'u','„Éß':'o',
        '„Ç´':'a','„Ç≠':'i','„ÇØ':'u','„Ç±':'e','„Ç≥':'o',
        '„Çµ':'a','„Ç∑':'i','„Çπ':'u','„Çª':'e','„ÇΩ':'o',
        '„Çø':'a','„ÉÅ':'i','„ÉÑ':'u','„ÉÜ':'e','„Éà':'o',
        '„Éä':'a','„Éã':'i','„Éå':'u','„Éç':'e','„Éé':'o',
        '„Éè':'a','„Éí':'i','„Éï':'u','„Éò':'e','„Éõ':'o',
        '„Éû':'a','„Éü':'i','„É†':'u','„É°':'e','„É¢':'o',
        '„É§':'a','„É¶':'u','„É®':'o',
        '„É©':'a','„É™':'i','„É´':'u','„É¨':'e','„É≠':'o',
        '„ÉØ':'a','„É≤':'o','„É≥':'n',
        '„Ç¨':'a','„ÇÆ':'i','„Ç∞':'u','„Ç≤':'e','„Ç¥':'o',
        '„Ç∂':'a','„Ç∏':'i','„Ç∫':'u','„Çº':'e','„Çæ':'o',
        '„ÉÄ':'a','„ÉÇ':'i','„ÉÖ':'u','„Éá':'e','„Éâ':'o',
        '„Éê':'a','„Éì':'i','„Éñ':'u','„Éô':'e','„Éú':'o',
        '„Éë':'a','„Éî':'i','„Éó':'u','„Éö':'e','„Éù':'o',
        '„Éº':'', '„ÉÉ':''
    }
    return "".join(table.get(c, "") for c in kana)

In [3]:
def consonants_of(kana):
    cons_map = {
        '„Ç¢':'','„Ç§':'','„Ç¶':'','„Ç®':'','„Ç™':'',
        '„Ç´':'k','„Ç≠':'k','„ÇØ':'k','„Ç±':'k','„Ç≥':'k',
        '„Çµ':'s','„Ç∑':'s','„Çπ':'s','„Çª':'s','„ÇΩ':'s',
        '„Çø':'t','„ÉÅ':'t','„ÉÑ':'t','„ÉÜ':'t','„Éà':'t',
        '„Éä':'n','„Éã':'n','„Éå':'n','„Éç':'n','„Éé':'n',
        '„Éè':'h','„Éí':'h','„Éï':'h','„Éò':'h','„Éõ':'h',
        '„Éû':'m','„Éü':'m','„É†':'m','„É°':'m','„É¢':'m',
        '„É§':'y','„É¶':'y','„É®':'y',
        '„É©':'r','„É™':'r','„É´':'r','„É¨':'r','„É≠':'r',
        '„ÉØ':'w','„É≤':'w','„É≥':'n',
        '„Ç¨':'g','„ÇÆ':'g','„Ç∞':'g','„Ç≤':'g','„Ç¥':'g',
        '„Ç∂':'z','„Ç∏':'z','„Ç∫':'z','„Çº':'z','„Çæ':'z',
        '„ÉÄ':'d','„ÉÇ':'d','„ÉÖ':'d','„Éá':'d','„Éâ':'d',
        '„Éê':'b','„Éì':'b','„Éñ':'b','„Éô':'b','„Éú':'b',
        '„Éë':'p','„Éî':'p','„Éó':'p','„Éö':'p','„Éù':'p',
        '„Éº':'','„ÉÉ':'t',
    }
    return "".join(cons_map.get(c, "") for c in kana)

In [4]:
def levenshtein(a, b):
    if not a or not b: return max(len(a), len(b))
    dp = [[i + j if i * j == 0 else 0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            dp[i][j] = min(
                dp[i-1][j] + 1,
                dp[i][j-1] + 1,
                dp[i-1][j-1] + (0 if a[i-1] == b[j-1] else 1)
            )
    return dp[-1][-1]

def similarity(a, b):
    if not a or not b: return 0
    dist = levenshtein(a, b)
    return 1 - dist / max(len(a), len(b))

In [5]:
def extract_features(sentence):
    tokens = list(tokenizer.tokenize(sentence))
    readings = [t.reading for t in tokens if t.reading != "*"]

    features = {}

    # Ë™ûÊï∞
    features["word_count"] = len(tokens)

    # Êú´Â∞æ2Ë™û
    if len(readings) >= 2:
        r1 = readings[-1]
        r2 = readings[-2]

        # Ë™≠„ÅøÊú´Â∞æÈ°û‰ºº
        features["end_read_sim4"] = similarity(r1[-4:], r2[-4:])

        # ÊØçÈü≥ÂàóÈ°û‰ºº
        v1 = vowels_of(r1)
        v2 = vowels_of(r2)
        features["vowel_sim"] = similarity(v1[-4:], v2[-4:])

        # Â≠êÈü≥ÂàóÈ°û‰ºº
        c1 = consonants_of(r1)
        c2 = consonants_of(r2)
        features["cons_sim"] = similarity(c1[-4:], c2[-4:])
    else:
        features["end_read_sim4"] = 0
        features["vowel_sim"] = 0
        features["cons_sim"] = 0

    # Ë™≠„ÅøÂÖ®‰Ωì„ÅÆÈï∑„ÅïÂπ≥Âùá
    features["avg_reading_len"] = np.mean([len(r) for r in readings]) if readings else 0

    return features

In [6]:
jokes = [
    "Â∏ÉÂõ£„Åå„Åµ„Å£„Å®„Çì„Å†",
    "„Ç§„ÇØ„É©„ÅØ„ÅÑ„Åè„ÇâÔºü",
    "„Ç´„É¨„Éº„ÅØ„Åã„Çå„Éº",
    "„Éà„Ç§„É¨„Å´„ÅÑ„Å£„Å®„ÅÑ„Çå",
    "„Ç´„Ç®„É´„ÅåÂ∏∞„Çã",
    "„Éç„Ç≥„Åå„Å≠„Åì„Çç„Çì„Å†",
    "„É©„ÇØ„ÉÄ„Å´‰πó„Çã„Å®„Çâ„Åè„Å†",
    "„Ç´„ÉÉ„Çø„Éº„ÇíË≤∑„Å£„Åü„ÄÇÂàá„Çå„Å™„Åã„Å£„Åü„Éº",
    "„Ç≥„Éº„É©„Çí„Åì„Åä„Çâ„Åõ„Çã",
    "„ÉÅ„Éº„Çø„ÅåËêΩ„Å£„Åì„Å°„Éº„Åü",
    "„Çπ„Ç≠„Éº„ÅåÂ•Ω„Åç",
    "„Ç§„É´„Ç´„ÅØ„ÅÑ„Çã„ÅãÔºü",
    "„Éë„É≥„ÉÄ„ÅÆ„Éë„É≥„Å†",
    "ÁÆ±„Çí„ÅØ„Åì„Å∂",
    "Ëô´„ÅØ„ÇÄ„Åó",
    "Ê†ó„ÅÆ„ÇØ„É™„Éº„É†",
    "Âêõ„ÅØÈªÑË∫´„ÅåÂ•Ω„ÅçÔºü",
    "„ÇΩ„Éº„ÉÄ„ÅØ„ÅÜ„Åæ„Åù„ÅÜ„Å†",
    "„Ç¢„Ç§„Çπ„ÇíÊÑõ„Åô",
    "„ÉÄ„Ç∏„É£„É¨„ÇíË®Ä„Å£„Åü„ÅÆ„ÅØË™∞„Åò„ÇÉ",
    "ÈõªË©±„Å´„ÄÅ„Åß„Çì„Çè",
    "„Éà„Éû„Éà„ÅåÊ≠¢„Åæ„Å£„Å®„Çã",
    "„Çø„É¨„Åå„Åü„Çå„Åü",
    "„Çπ„Ç§„Ç´„ÅØ„ÇÑ„Åô„ÅÑ„ÅãÔºü",
    "ÁÑºËÇâ„ÅØÁÑº„Åç„Å´„Åè„ÅÑ",
    "Âªä‰∏ã„Å´„Åô„Çè„Çç„ÅÜ„Åã",
    "„Åì„ÅÆ„Çø„Ç§„É§Âõ∫„ÅÑ„ÇÑ",
    "ÂÜÖËáì„Åå„Å™„ÅÑ„Åû„ÅÜ",
    "„Éõ„ÉÉ„Éà„Ç±„Éº„Ç≠„ÅØ„Åª„Å£„Å®„Åë„Éº",
    "ÈºªÊØõ„ÅØ„Å™„Åí„Éº",
    "„ÉØ„Éã„ÅåËº™„Å´„Å™„Å£„Åü",
    "Ëçâ„Åå„Åè„Åï„ÅÑ"
]

labels = [
    2,2,1,2,2,1,2,2,1,2,1,2,1,2,2,2,2,2,2,1,
    1,2,1,2,1,2,1,1,2,1,  # 1„Äú30
    1,  # 31: „ÉØ„Éã„ÅåËº™„Å´„Å™„Å£„Åü
    2   # 32: Ëçâ„Åå„Åè„Åï„ÅÑ
]

In [7]:
X = [extract_features(j) for j in jokes]
vec = DictVectorizer(sparse=False)
Xvec = vec.fit_transform(X)

model = LinearRegression()
model.fit(Xvec, labels)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
model.fit(Xvec, labels)
def judge(joke):
    f = extract_features(joke)
    Xtest = vec.transform([f])
    return model.predict(Xtest)[0]

def judge_text(score):
    if score >= 1.5:
        return "ü§£ Èù¢ÁôΩ„ÅÑÔºÅ"
    else:
        return "üòê Èù¢ÁôΩ„Åè„Å™„ÅÑ„Åã„ÇÇ‚Ä¶"

# „ÉÜ„Çπ„Éà
joke = "„ÅÇ„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè"
score = judge(joke)
print(joke, "‚Üí", score, judge_text(score))

„ÅÇ„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè„Çè ‚Üí 2.2485067205381046 ü§£ Èù¢ÁôΩ„ÅÑÔºÅ
