## Student ID: 113062594, name: 陳力瑋

In [1]:
import pandas as pd

df = pd.read_csv("train.csv")
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


## How did you preprocess data
- get title,auther_name,year,month,day,hour,weekday,see_also,see_also_num,contents,content_len 
- delete stop words in topic.
- stem, and lemmatize degrade performance

In [11]:
import re
from bs4 import BeautifulSoup
from datetime import datetime


def preprocessor(text):
    soup = BeautifulSoup(text, "html.parser")

    title = soup.body.h1.get_text(strip=True).lower()

    topic = " ".join(
        [
            category.get_text(strip=True).replace(" ", "-").lower()
            for category in soup.body.select(".article-topics > a")
        ]
    )

    author_name = soup.head.select_one(".author_name > a")
    if author_name is None:
        author_name = soup.head.select_one(".author_name")
    if author_name is  None:
        author_name = soup.head.select_one(".article-info > span")
    if author_name is  None:
        author_name = soup.head.select_one(".article-info > a")
    author_name = author_name.get_text(strip=True).replace(" ", "-").lower()
    author_name = author_name.replace("by-", "")
    author_name = author_name.replace(",-", " ")
    author_name = author_name.replace("-and-", " ")
    author_name = author_name.replace("-&-", " ")
    author_name = author_name.replace("- ", " ")
    author_name = author_name.replace("--", " ")

    date = soup.head.select_one("time")
    year = 0
    month = 0
    day = 0
    hour = 25
    weekday = 0
    if date is not None and date.get_text(strip=True) != "":
        date = datetime.strptime(date.get_text(strip=True).replace("UTC", "+0000"), "%Y-%m-%d %H:%M:%S %z")
        year = date.year
        month = date.month
        day = date.day
        hour = date.hour
        weekday = date.isoweekday()

    # see_also and see_also_num no help
    see_also = soup.body.select(".see-also > a")
    see_also_num = 0
    if see_also:
        see_also_num = len(see_also)
        see_also = " ".join(
            [
                see_also.get_text(strip=True).lower()
                for see_also in see_also
            ]
        )
    else:
        see_also = ""

    # content no help
    contents = " ".join(
        [
            article_content.get_text(strip=True).lower()
            for article_content in soup.body.select(".article-content")
        ]
    )
    content_len = len(contents)


    return title, topic, author_name, year, month, day, hour, content_len,see_also, see_also_num, contents, weekday


In [3]:
print(preprocessor(df.loc[0, "Page content"]))

("nasa's grand challenge: stop asteroids from destroying earth", 'asteroid asteroids challenge earth space u.s. world', 'clara-moskowitz', 2013, 6, 19, 15, 3564, '', 0, 'there may be killer asteroids headed for earth, and nasa has decided to do something about it. the space agency announced a new "grand challenge" on june 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet.the new mission builds on projects already underway at nasa, including a plan tocapture an asteroid, pull it in toward the moon and send astronauts to visit it. as part of the grand challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry, academia and the public on how to improve the asteroid mission plan."we\'re asking for you to think about concepts and different approaches for what we\'ve described here," william gerstenmaier, nasa\'s associate administrator for human explorations and operations, said yesterday during a nasa

In [4]:
features = [preprocessor(page_content) for page_content in df["Page content"]]
df_processed = pd.DataFrame(
    features,
    columns=[
        "title",
        "topic",
        "author_name",
        "year",
        "month",
        "day",
        "hour",
        "content_len",
        "see_also",
        "see_also_num",
        "contents",
        "weekday",
    ],
)

In [5]:
def tokenizer(text):
    return re.split("\s+", text[0].strip())


print(tokenizer("runners like running and thus they run"))

['r']


In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop = stopwords.words("english")


def tokenizer_nostop(text):
    return [
        w
        for w in re.split("\s+", text[0].strip())
        if w not in stop and re.match("[a-zA-Z]+", w)
    ]


print(tokenizer_nostop("runners like running and thus they run"))

['r']


[nltk_data] Downloading package stopwords to /home/wayne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.model_selection import train_test_split

X_train = df_processed.values
y_train = (df["Popularity"].values == 1).astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

- I found only title, topic, year, month, day, hour, content_len, weekday can imporve performance
- LGBMClassifier better than RandomForestClassifier

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer


transformer = ColumnTransformer(
    [
        ("title", CountVectorizer(tokenizer=tokenizer, lowercase=False), [0]),
        ("topic", CountVectorizer(tokenizer=tokenizer_nostop, lowercase=False), [1]),
        ("author_name", "drop", [2]),
        # ("year", "drop", [3]),
        # ("month", "drop", [4]),
        # ("day", "drop", [5]),
        # ("hour", "drop", [6]),
        # ("content_len", "drop", [7]),
        ("see_also", "drop", [8]),
        ("see_also_num", "drop", [9]),
        ("contents", "drop", [10]),
        # ("weekday", "drop", [11]),
    ],
    n_jobs=-1,
    remainder="passthrough",
)

X_train = transformer.fit_transform(X_train)
X_valid = transformer.transform(X_valid)



In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


forest = RandomForestClassifier(n_jobs=-1, random_state=0, n_estimators=400)


forest.fit(X_train, y_train)
print(
    "train score: {:.5f}".format(
        roc_auc_score(y_train, forest.predict_proba(X_train)[:, 1])
    )
)
print(
    "valid score: {:.5f}".format(
        roc_auc_score(y_valid, forest.predict_proba(X_valid)[:, 1])
    )
)

train score: 1.00000
valid score: 0.58336


In [10]:
from lightgbm import LGBMClassifier

lgbm_classifier = LGBMClassifier(random_state=0, learning_rate=0.012, n_estimators=190)
lgbm_classifier.fit(X_train, y_train)

print(
    "train score: {:.5f}".format(
        roc_auc_score(y_train, lgbm_classifier.predict_proba(X_train)[:, 1])
    )
)
print(
    "valid score: {:.5f}".format(
        roc_auc_score(y_valid, lgbm_classifier.predict_proba(X_valid)[:, 1])
    )
)

[LightGBM] [Info] Number of positive: 10885, number of negative: 11229
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4507
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 1981
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492222 -> initscore=-0.031114
[LightGBM] [Info] Start training from score -0.031114
train score: 0.67597
valid score: 0.59496


In [12]:
test_features = [
    preprocessor(page_content)
    for page_content in pd.read_csv("test.csv")["Page content"]
]
df_test = pd.DataFrame(
    test_features,
    columns=[
        "title",
        "topic",
        "author_name",
        "year",
        "month",
        "day",
        "hour",
        "content_len",
        "see_also",
        "see_also_num",
        "contents",
        "weekday",
    ],
)

X_test = transformer.transform(df_test.values)
y_pred = lgbm_classifier.predict_proba(X_test)
y_pred


array([[0.56084331, 0.43915669],
       [0.57333006, 0.42666994],
       [0.56516231, 0.43483769],
       ...,
       [0.50313453, 0.49686547],
       [0.5599558 , 0.4400442 ],
       [0.69606248, 0.30393752]])