引入所需库

In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from numpy import reshape
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm


In [2]:
# Read the train and test CSV files, discard the 'Unnamed: 0' column from each,
# and stack the two frames (row labels are kept exactly as read).
df1 = pd.read_csv('Webpages_Classification_train_data.csv')
df1 = df1.drop(columns=['Unnamed: 0'])
df2 = pd.read_csv('Webpages_Classification_test_data.csv')
df2 = df2.drop(columns=['Unnamed: 0'])

concatenated_df = pd.concat([df1, df2])

In [3]:
# Preview the first rows of the combined frame.
concatenated_df.head()

Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label
0,http://members.tripod.com/russiastation/,40,42.77.221.155,Taiwan,com,complete,yes,58.0,0.0,Named themselves charged particles in a manly ...,good
1,http://www.ddj.com/cpp/184403822,32,3.211.202.180,United States,com,complete,yes,52.5,0.0,And filipino field \n \n \n \n \n \n \n \n the...,good
2,http://www.naef-usa.com/,24,24.232.54.41,Argentina,com,complete,yes,103.5,0.0,"Took in cognitivism, whose adherents argue for...",good
3,http://www.ff-b2b.de/,21,147.22.38.45,United States,de,incomplete,no,720.0,532.8,fire cumshot sodomize footaction tortur failed...,bad
4,http://us.imdb.com/title/tt0176269/,35,205.30.239.85,United States,com,complete,yes,46.5,0.0,"Levant, also monsignor georges. In 1800, lists...",good


In [None]:
# Summary statistics of the label column in the combined frame.
concatenated_df.label.describe()

In [None]:
# Column dtypes of the combined frame.
concatenated_df.dtypes

In [None]:
# Summary statistics of the label column.
# NOTE(review): this is the same call as the cell two above; one of them can probably go.
concatenated_df.label.describe()

In [None]:
# Draw 30,000 rows per label with a fixed seed, renumber the rows, then keep
# only rows whose content has at least 60 whitespace-separated tokens.
concatenated_df_good_equally_bad = (
    concatenated_df
    .groupby('label')
    .apply(lambda grp: grp.sample(30000, random_state=42))
    .reset_index(drop=True)
    .loc[lambda frame: frame.content.str.split().str.len().ge(60)]
)
concatenated_df_good_equally_bad.label.describe()

In [None]:
# Preview the sampled and length-filtered frame.
concatenated_df_good_equally_bad.head()

In [None]:
# Resample the trimmed frame to 3,000 rows per label (fixed seed), then shuffle
# all rows and renumber them from 0.
re_concatenated_df_good_equally_bad = (
    concatenated_df_good_equally_bad
    .groupby('label')
    .apply(lambda grp: grp.sample(3000, random_state=42))
    .reset_index(drop=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
re_concatenated_df_good_equally_bad.label.describe()

In [None]:
# Preview the resampled, shuffled frame.
re_concatenated_df_good_equally_bad.head()

In [None]:
# Summary statistics of the columns that are ordinal-encoded in the next step.
re_concatenated_df_good_equally_bad[['geo_loc', 'tld','who_is','https', 'label']].describe()

In [None]:
# Replace each categorical column (and the label) with ordinal codes, fitting
# a fresh encoder per column. The encoders themselves are not kept.
for column in ['geo_loc', 'tld', 'who_is', 'https', 'label']:
    re_concatenated_df_good_equally_bad[column] = OrdinalEncoder().fit_transform(
        re_concatenated_df_good_equally_bad[column].values.reshape(-1, 1)
    )

In [None]:
re_concatenated_df_good_equally_bad.head()


def url_to_text(url):
    """Turn a URL into a space-separated string of its host and path pieces.

    The scheme (everything up to the first ``://``) and one leading ``www.``
    are removed; the remainder is split on ``.`` and ``/``.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The non-empty pieces joined by single spaces.
    """
    # Split only once: a later '://' (for example inside a query string) is part
    # of the URL. Taking [-1] keeps scheme-less input instead of raising IndexError.
    remainder = url.split('://', 1)[-1]
    # str.strip('www.') treats its argument as a SET of characters and trims both
    # ends, so it also removed a trailing "w" or a leading "w" of the real host
    # name. Remove the exact prefix instead.
    if remainder.startswith('www.'):
        remainder = remainder[len('www.'):]
    return ' '.join(piece for piece in remainder.replace('.', '/').split('/') if piece)


# convert url into human readable string that can be tokenized
re_concatenated_df_good_equally_bad['url'] = re_concatenated_df_good_equally_bad.url.apply(url_to_text)
re_concatenated_df_good_equally_bad.head()

print("Before Preprocessing:")
print(re_concatenated_df_good_equally_bad.content.head())

tqdm.pandas()
# A set makes each `token not in stop` check constant-time; the plain list
# returned by nltk was scanned once per token.
# NOTE(review): stopwords.words() is called without a language, which presumably
# loads the stop words of every language nltk ships — confirm that is intended.
stop = set(stopwords.words())


def clean_text(series):
    """Strip punctuation, lower-case, and drop stop words from a Series of strings.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series holding the cleaned strings.
    """
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which left all punctuation in place. The raw string
    # avoids the invalid "\w" escape warning.
    without_punctuation = series.str.replace(r"[^\w\s]", "", regex=True).str.lower()
    return without_punctuation.progress_apply(
        lambda text: ' '.join(token for token in text.split() if token not in stop)
    )


re_concatenated_df_good_equally_bad.content = clean_text(re_concatenated_df_good_equally_bad.content)
re_concatenated_df_good_equally_bad.url = clean_text(re_concatenated_df_good_equally_bad.url)

print("After Preprocessing:")
print(re_concatenated_df_good_equally_bad.content.head())

# One vectorizer object is fitted twice: first on the URL text, then on the page
# content. After this cell `tfidf` holds the vocabulary fitted on content.
tfidf = TfidfVectorizer(stop_words='english', max_features=8000, max_df=0.95, min_df=5)

url_text = re_concatenated_df_good_equally_bad.url
url_tfidf = tfidf.fit(url_text).transform(url_text)

content_text = re_concatenated_df_good_equally_bad.content
content_tfidf = tfidf.fit(content_text).transform(content_text)

re_concatenated_df_good_equally_bad.head()

def find_optimal_clusters(data, max_k):
    """Plot the MiniBatchKMeans inertia (SSE) for every k from 2 to max_k.

    Args:
        data: Feature matrix, passed unchanged to ``MiniBatchKMeans.fit``.
        max_k: Largest number of clusters to try (inclusive).

    Side effects:
        Switches matplotlib to the "dark_background" style and draws a figure.
        Nothing is returned.
    """
    candidate_ks = range(2, max_k + 1)

    # One clustering run per candidate k; only the final inertia is kept.
    inertias = [
        MiniBatchKMeans(
            n_clusters=n_clusters, init_size=1024, batch_size=2048, random_state=20
        ).fit(data).inertia_
        for n_clusters in candidate_ks
    ]

    plt.style.use("dark_background")
    _, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')

# Elbow plot for the URL TF-IDF matrix, then label every row with one of 9 URL clusters.
find_optimal_clusters(url_tfidf, 20)

url_kmeans = MiniBatchKMeans(n_clusters=9, init_size=1024, batch_size=2048, random_state=20)
re_concatenated_df_good_equally_bad['url_cluster'] = url_kmeans.fit_predict(url_tfidf)

# Same procedure for the page content, with 5 clusters.
find_optimal_clusters(content_tfidf, 20)

content_kmeans = MiniBatchKMeans(n_clusters=5, init_size=1024, batch_size=2048, random_state=20)
re_concatenated_df_good_equally_bad['content_cluster'] = content_kmeans.fit_predict(content_tfidf)

re_concatenated_df_good_equally_bad.head()

In [None]:
# Load the pretrained sentence-embedding model used for the URL and content text.
SentenceTransformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The direct assignment `frame['url'] = SentenceTransformer_model.encode(frame['url'])`
# that used to follow was removed: encode() on a whole column returns a 2-D array
# (rows x embedding size), which pandas refuses to store in a single column
# (ValueError). The following cell performs the conversion and stores one
# vector per row.

In [None]:
# Encode all URL strings in one batched call instead of one model call per row.
# encode() returns a 2-D array; list() splits it into one 1-D vector per row so
# each cell of the column holds that row's embedding.
# NOTE: this overwrites the text column, so the cell must not be run twice.
re_concatenated_df_good_equally_bad['url'] = list(
    SentenceTransformer_model.encode(re_concatenated_df_good_equally_bad['url'].tolist())
)

In [None]:
re_concatenated_df_good_equally_bad.head()

# The direct assignment `frame['content'] = SentenceTransformer_model.encode(frame['content'])`
# that used to follow was removed: the 2-D array returned by encode() cannot be
# stored in a single column (pandas raises ValueError). The following cell
# performs the conversion and stores one vector per row.

In [None]:
# Encode all content strings in one batched call instead of one model call per
# row; list() turns the 2-D result into one 1-D vector per row.
# NOTE: this overwrites the text column, so the cell must not be run twice.
re_concatenated_df_good_equally_bad['content'] = list(
    SentenceTransformer_model.encode(re_concatenated_df_good_equally_bad['content'].tolist())
)

In [None]:
# Preview the frame after the url and content columns were replaced by embeddings.
re_concatenated_df_good_equally_bad.head()

In [None]:
# Show the url column as a raw array of its values.
re_concatenated_df_good_equally_bad.url.values

In [None]:
# Separate features and labels.
# Features are the url column only. An alternative tabular feature set is kept
# here for reference:
# X = re_concatenated_df_good_equally_bad[['url_cluster', 'url_len', 'geo_loc', 'tld', 'who_is', 'https',
#                                           'content_cluster', 'js_len', 'js_obf_len']]
X = re_concatenated_df_good_equally_bad['url']
y = re_concatenated_df_good_equally_bad['label']

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Shape of the first training row's feature value.
# .iloc selects by position. X_train[0] looked up the row LABELLED 0, which
# raises KeyError whenever the shuffled split put that row in the test set.
X_train.iloc[0].shape

In [None]:
# Display the training labels.
y_train

In [None]:
# Shape of the training feature Series.
X_train.shape

In [None]:
# Materialise the training features as a plain Python list, the form later passed to fit/predict.
# NOTE(review): as the last expression of a cell this displays every element; consider showing a slice.
[i for i in X_train]

In [None]:
# Grid-search a LogisticRegression model over C and solver with 5-fold cross-validation.
params = {
    'C': [0.1, 0.5, 1.0],
    'solver': ['lbfgs', 'liblinear'],
}
model = LogisticRegression(max_iter=5000)
grid = GridSearchCV(estimator=model, param_grid=params, cv=5)

# The Series of per-row values is turned into a plain list before fitting.
grid.fit(list(X_train), y_train)

In [None]:
# Report the hyper-parameter combination selected by the grid search and its score.
print("Best parameters for LogisticRegression: ", grid.best_params_)
print("Best score for LogisticRegression: ", grid.best_score_)

In [None]:
# Train the final LogisticRegression model.
# The hyper-parameters are read from the fitted grid search instead of being
# hard-coded, so this cell always agrees with the search result.
best_C = grid.best_params_['C']
best_solver = grid.best_params_['solver']
# Use the same iteration cap as the estimator that was cross-validated; the
# library default is lower and the final model would otherwise be trained
# under different conditions than the ones that were scored.
model = LogisticRegression(C=best_C, solver=best_solver, max_iter=5000)
model.fit([i for i in X_train], y_train)

In [None]:
# Accuracy of the trained model on the training split.
train_features = list(X_train)
y_pred_train = model.predict(train_features)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print("Accuracy: ", accuracy_train)

In [None]:
# Accuracy of the trained model on the held-out test split.
test_features = list(X_test)
y_pred_test = model.predict(test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Accuracy: ", accuracy_test)

In [None]:
def _normalize_url_for_model(url):
    """Apply the training-time URL clean-up to one raw URL string.

    The training URLs were reduced to host/path pieces, stripped of
    punctuation, lower-cased and filtered against the notebook-level `stop`
    collection before being embedded; a URL to classify has to go through the
    same steps or the model sees text unlike anything it was trained on.

    Args:
        url: Raw URL string, with or without a scheme.

    Returns:
        The cleaned, space-separated token string.
    """
    remainder = url.split('://', 1)[-1]
    if remainder.startswith('www.'):
        remainder = remainder[len('www.'):]
    text = ' '.join(remainder.replace('.', '/').split('/'))
    text = re.sub(r"[^\w\s]", "", text).lower()
    return ' '.join(token for token in text.split() if token not in stop)


def url_check(url):
    """Print whether the trained classifier labels `url` as good or bad.

    Args:
        url: Raw URL string to classify.

    Side effects:
        Prints the predicted label. Nothing is returned.

    Raises:
        KeyError: if the classifier predicts a value other than 0 or 1.
    """
    embedding = SentenceTransformer_model.encode(_normalize_url_for_model(url))
    # predict() expects a 2-D input: one row, embedding-size columns.
    embedding = embedding.reshape(1, -1)
    prediction = model.predict(embedding)
    # Codes come from the ordinal encoding of the label column;
    # presumably 'bad' -> 0 and 'good' -> 1 — confirm against the encoder.
    dic = {1: 'good', 0: 'bad'}
    print("Prediction for new URL: ", dic[prediction[0]])

In [None]:
# Classify a sample URL with the trained model.
url_check("http://www.cheernudes.com/lesbian/1024lss/")

In [None]:
# Classify a second sample URL with the trained model.
url_check("http://www.example.com")

In [None]:

#
# Create your model here (same as above)
#
# Save to file in the current working directory


In [None]:
# Persist the trained classifier as a pickle file in the current working directory.
pkl_filename = "url_check_model.pkl"
with open(pkl_filename, mode='wb') as pkl_out:
    pickle.dump(model, pkl_out)


In [None]:
# Read the classifier back from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook produced.
pkl_filename = "url_check_model.pkl"
with open(pkl_filename, mode='rb') as pkl_in:
    pickle_model = pickle.load(pkl_in)

In [None]:
# Accuracy of the reloaded model on the training split.
train_features = list(X_train)
y_pred_train = pickle_model.predict(train_features)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print("Accuracy: ", accuracy_train)

In [None]:
# Accuracy of the reloaded model on the held-out test split.
test_features = list(X_test)
y_pred_test = pickle_model.predict(test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Accuracy: ", accuracy_test)