# Predicting a Pulsar Star

In [6]:
import pandas as pd

In [2]:
df = pd.read_csv('pulsar_stars.csv')

In [3]:
df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 Mean of the integrated profile                  17898 non-null float64
 Standard deviation of the integrated profile    17898 non-null float64
 Excess kurtosis of the integrated profile       17898 non-null float64
 Skewness of the integrated profile              17898 non-null float64
 Mean of the DM-SNR curve                        17898 non-null float64
 Standard deviation of the DM-SNR curve          17898 non-null float64
 Excess kurtosis of the DM-SNR curve             17898 non-null float64
 Skewness of the DM-SNR curve                    17898 non-null float64
target_class                                     17898 non-null int64
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


# Scale the Data

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()

In [9]:
scaled_features = scaler.fit_transform(df.drop('target_class', axis=1))
df_features = pd.DataFrame(scaled_features, columns=df.columns[:-1])
df_features.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve
0,1.149317,1.334832,-0.66957,-0.400459,-0.31944,-0.370625,-0.072798,-0.287438
1,-0.334168,1.802265,-0.011785,-0.370535,-0.371102,-0.588924,0.504427,0.211581
2,-0.314372,-1.053322,-0.145233,-0.116593,-0.322107,-0.235328,-0.125996,-0.391373
3,1.000694,1.553254,-0.513409,-0.390178,-0.304404,-0.275666,-0.312265,-0.4813
4,-0.871402,-0.858879,0.115609,-0.104866,-0.38801,-0.763111,1.324026,1.386794


# Train the Data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = df_features
y = df['target_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [13]:
reg = LogisticRegression()
rfc = RandomForestClassifier()

In [14]:
reg.fit(X_train,y_train)
rfc.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
reg_pred = reg.predict(X_test)
rfc_pred = rfc.predict(X_test)

# Evaluate the Model

In [16]:
from sklearn.metrics import confusion_matrix, classification_report

In [18]:
print(confusion_matrix(y_test, reg_pred))
print('\n')
print(classification_report(y_test, reg_pred))

[[5352   29]
 [  99  427]]


              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5381
           1       0.94      0.81      0.87       526

    accuracy                           0.98      5907
   macro avg       0.96      0.90      0.93      5907
weighted avg       0.98      0.98      0.98      5907



In [19]:
print(confusion_matrix(y_test, rfc_pred))
print('\n')
print(classification_report(y_test, rfc_pred))

[[5346   35]
 [  94  432]]


              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5381
           1       0.93      0.82      0.87       526

    accuracy                           0.98      5907
   macro avg       0.95      0.91      0.93      5907
weighted avg       0.98      0.98      0.98      5907



# Conclusion

It looks like both the Logistic Regression & Random Forrest Classifier performed well and were similar in results.