In [2]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout
from keras.models import Sequential


In [3]:
# Read the train.csv into a DataFrame
data = pd.read_csv('data\Doceree-HCP_Train.csv', encoding='latin-1')

# Remove leading/trailing whitespaces from all string columns
data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Convert ID column to string
data['ID'] = data['ID'].astype(str)

# Remove rows with missing values in specific columns
data = data.dropna(subset=['DEVICETYPE', 'PLATFORM_ID'])

# Clean the BIDREQUESTIP column by removing invalid IP addresses
data['BIDREQUESTIP'] = data['BIDREQUESTIP'].astype(str).apply(lambda x: x if pd.Series(x).str.contains('^(\d{1,3}\.){3}\d{1,3}$').all() else None)

# Clean the USERZIPCODE column by removing non-numeric characters
data['USERZIPCODE'] = data['USERZIPCODE'].astype(str).str.replace(r'\D+', '')

# Split the USERAGENT column into separate browser and operating system columns
data[['BROWSER', 'OS']] = data['USERAGENT'].astype(str).str.split('(', n=1, expand=True)
data['BROWSER'] = data['BROWSER'].astype(str).str.strip()
data['OS'] = data['OS'].astype(str).str.rstrip(')')

# Remove duplicates from KEYWORDS column
data['KEYWORDS'] = data['KEYWORDS'].apply(lambda x: '|'.join(set(x.split('|'))) if pd.notnull(x) else x)

# Fill missing values in IS_HCP column with 0
data['IS_HCP'].fillna(0, inplace=True)

# Convert IS_HCP column to integer
data['IS_HCP'] = data['IS_HCP'].astype(int)

# Print the cleaned data
print(data.head(3))


  data['BIDREQUESTIP'] = data['BIDREQUESTIP'].astype(str).apply(lambda x: x if pd.Series(x).str.contains('^(\d{1,3}\.){3}\d{1,3}$').all() else None)
  data['USERZIPCODE'] = data['USERZIPCODE'].astype(str).str.replace(r'\D+', '')


     ID DEVICETYPE  PLATFORM_ID   BIDREQUESTIP  \
0  1001    Desktop            2   170.173.0.22   
1  1002    Desktop            2  65.216.253.25   
2  1003    Desktop            2   66.232.79.22   

                        USERPLATFORMUID     USERCITY USERZIPCODE  \
0  6974dcaa-f932-480e-9fb5-c52e20e1393a     Portland      972060   
1  c12f3f8f-8fcf-484a-90e1-1ac04db8cdcf    Arlington      222020   
2  a698de4b-e200-46dd-b5fb-40402175ae18  New Meadows      836540   

                                           USERAGENT            PLATFORMTYPE  \
0  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...  Online Medical Journal   
1  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...  Online Medical Journal   
2  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...  Online Medical Journal   

  CHANNELTYPE                                                URL  \
0     Website  https://www.cancertherapyadvisor.com/home/canc...   
1     Website  https://www.cancertherapyadvisor.com/home/deci...   
2

In [4]:
data.dtypes.count()

16

In [8]:
len(data['TAXONOMY'].unique())

208

In [71]:
# Read the data from CSV file
test_data = pd.read_csv('data\Doceree-HCP-Test.csv', encoding='latin-1')

# Remove leading/trailing whitespaces from all string columns
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Convert ID column to string
test_data['ID'] = test_data['ID'].astype(str)

# Remove rows with missing values in specific columns
test_data = test_data.dropna(subset=['DEVICETYPE', 'PLATFORM_ID'])

# Clean the BIDREQUESTIP column by removing invalid IP addresses
test_data['BIDREQUESTIP'] = test_data['BIDREQUESTIP'].astype(str).apply(lambda x: x if pd.Series(x).str.contains('^(\d{1,3}\.){3}\d{1,3}$').all() else None)

# Clean the USERZIPCODE column by removing non-numeric characters
test_data['USERZIPCODE'] = test_data['USERZIPCODE'].astype(str).str.replace(r'\D+', '')

# Split the USERAGENT column into separate browser and operating system columns
test_data[['BROWSER', 'OS']] = test_data['USERAGENT'].astype(str).str.split('(', n=1, expand=True)
test_data['BROWSER'] = test_data['BROWSER'].astype(str).str.strip()
test_data['OS'] = test_data['OS'].astype(str).str.rstrip(')')

# Remove duplicates from KEYWORDS column
test_data['KEYWORDS'] = test_data['KEYWORDS'].apply(lambda x: '|'.join(set(x.split('|'))) if pd.notnull(x) else x)

# Print the cleaned data
print(test_data.head(3))


  test_data['BIDREQUESTIP'] = test_data['BIDREQUESTIP'].astype(str).apply(lambda x: x if pd.Series(x).str.contains('^(\d{1,3}\.){3}\d{1,3}$').all() else None)
  test_data['USERZIPCODE'] = test_data['USERZIPCODE'].astype(str).str.replace(r'\D+', '')


       ID DEVICETYPE  PLATFORM_ID     BIDREQUESTIP  \
0  115501    Desktop          2.0   75.189.231.103   
1  115502     Mobile          2.0    24.101.33.158   
2  115503    Desktop          2.0  172.118.216.142   

                        USERPLATFORMUID       USERCITY USERZIPCODE  \
0  0d5041ff-f0b6-4d1a-9ad7-0a29f7d485b4   Fayetteville      283050   
1  c8396dd0-969f-4d99-a40b-b7bb1f516154  Conneaut Lake      163160   
2  3c97a081-6518-43f8-9f26-369759cfb471         Covina      917240   

                                           USERAGENT            PLATFORMTYPE  \
0  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6...  Online Medical Journal   
1  Mozilla/5.0 (iPhone; CPU iPhone OS 15_6_1 like...  Online Medical Journal   
2  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...  Online Medical Journal   

  CHANNELTYPE                                                URL  \
0     Website  https://www.clinicaladvisor.com/home/features/...   
1     Website  https://www.ophthalmologyadvis

In [72]:
test_data.dtypes.count()

14

In [76]:
test_data.dtypes.describe

<bound method NDFrame.describe of ID                  object
DEVICETYPE          object
PLATFORM_ID        float64
BIDREQUESTIP        object
USERPLATFORMUID     object
USERCITY            object
USERZIPCODE         object
USERAGENT           object
PLATFORMTYPE        object
CHANNELTYPE         object
URL                 object
KEYWORDS            object
BROWSER             object
OS                  object
dtype: object>

In [77]:
# encode all the columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
test_data = test_data.apply(le.fit_transform)

In [78]:
test_data.head(3)

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS,BROWSER,OS
0,0,0,1,12371,935,858,1621,1230,3,0,545,372,4,1236
1,1,1,1,6273,14094,560,614,2119,3,0,2279,419,4,2127
2,2,0,1,2996,4194,593,6202,1328,3,0,2305,78,4,1334


In [None]:
test_data.dtypes.count()

In [64]:
# Divide the data target variable and features
X = data.drop(['IS_HCP'], axis=1)
y = data['IS_HCP']

In [65]:
# encode all the columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X = X.apply(le.fit_transform)

In [66]:
X.dtypes.count()

15

In [67]:
X.head()

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS,TAXONOMY,BROWSER,OS
0,110,0,1,6580,22183,3177,10914,2857,3,0,666,743,207,10,2857
1,221,0,1,19253,40576,130,1901,2672,3,0,708,412,207,10,2671
2,332,0,1,19704,34968,2701,9157,2672,3,0,666,743,207,10,2671
3,443,0,2,4193,14676,4420,1960,2945,3,0,327,187,149,10,2945
4,554,1,6,10056,33844,1767,8360,4049,3,0,1631,288,207,10,4057


In [68]:
X.dtypes.count()

15

In [81]:
# model
model = Sequential()
model.add(Dense(64, input_dim=15, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
history = model.fit(X, y, epochs=10, batch_size=10)

# Evaluate the model
loss, accuracy = model.evaluate(X, y)

print('Accuracy: %.2f' % (accuracy*100))

# Predict the target variable
# y_pred = model.predict(test_data)

# Save the model
model.save('model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 96.26
