In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# data visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide pandas display settings: show every column, keep wide frames
# on a single line instead of wrapping, and never truncate long cell values.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` is the supported "no limit" value for column width. The previous
# `-1` sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
# NOTE(review): `None` needs pandas >= 1.0 — confirm the target environment.
pd.set_option('display.max_colwidth', None)

In [3]:
# Read the source CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory so the notebook runs on other machines.
df = pd.read_csv(filepath_or_buffer="C://Data/all.csv")

In [4]:
# Drop the columns listed below from `df`.
# Each label is listed exactly once (the earlier list named 'ph_PostMigrated'
# twice), and the result is rebound to `df` rather than mutated with
# inplace=True, which pandas discourages and which cannot be chained.
df = df.drop(
    columns=[
        'ph_PostMigratedHere',
        'ph_UnknownSuggestionEvent',
        'ph_QuestionMerged',
        'ph_PostMigrated',
        'p_Wiki',
        'ph_QuestionUnmerged',
        'ph_PostDisassociated',
        'ph_SuggestedEditApplied',
        'ph_UnknownDevRelatedEvent',
        'ph_VoteNullificationByDev',
        'ph_PostTweeted',
        'ph_PostUnmigrated',
        'ph_UnknownModeratorEvent',
        'ph_UnknownEvent',
        'ph_CommentDiscussionMovedToChat',
        'p_PrivilegeWiki',
        'p_WikiPlaceholder',
    ]
)

In [5]:
# Split `df` into train and test frames, holding out 20% of the rows for
# testing; random_state is fixed at 0 so the split is the same on every run.
combine = list(train_test_split(df, test_size=0.2, random_state=0))
# Unpack the two frames under their own names; `combine` keeps both in one list.
df_train, df_test = combine

In [6]:
# Print the column labels of the training frame.
print(df_train.columns.values)

['Id' 'country' 'AboutMe_length' 'activity_in_months' 'UpVotes'
 'DownVotes' 'Reputation' 'Views' 'badges' 'Q_comments' 'A_comments'
 'P_questions' 'P_answers' 'p_ModeratorNomination' 'p_TagWiki'
 'p_TagWikiExerpt' 'ph_InitialTitle' 'ph_EditTitle' 'ph_InitialBody'
 'ph_InitialTags' 'ph_EditBody' 'ph_EditTags' 'ph_RollbackTitle'
 'ph_RollbackBody' 'ph_PostReopened' 'ph_RollbackTags' 'ph_PostClosed'
 'ph_PostDeleted' 'ph_PostUndeleted' 'ph_CommunityOwned' 'ph_PostLocked'
 'ph_PostUnlocked' 'ph_QuestionUnprotected' 'ph_QuestionProtected'
 'ph_PostNoticeRemoved' 'ph_PostNoticeAdded' 'ph_PostMergeSource'
 'ph_PostMigratedAway' 'ph_PostMergeDestination']


In [7]:
# Report dtypes and non-null counts for the training frame, then the test
# frame; a 50-character dashed rule separates the two reports.
df_train.info()
print(50 * "-")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112147 entries, 8441 to 43567
Data columns (total 39 columns):
Id                         112147 non-null int64
country                    112147 non-null object
AboutMe_length             72567 non-null float64
activity_in_months         112147 non-null float64
UpVotes                    112147 non-null int64
DownVotes                  112147 non-null int64
Reputation                 112147 non-null int64
Views                      112147 non-null int64
badges                     112147 non-null int64
Q_comments                 29880 non-null float64
A_comments                 29880 non-null float64
P_questions                83111 non-null float64
P_answers                  82712 non-null float64
p_ModeratorNomination      18 non-null float64
p_TagWiki                  639 non-null float64
p_TagWikiExerpt            782 non-null float64
ph_InitialTitle            30676 non-null float64
ph_EditTitle               7457 non-null float64


In [8]:
# Summary statistics for the numeric columns of the training frame, with the
# default quartiles (25%, 50%, 75%) spelled out explicitly.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Id,AboutMe_length,activity_in_months,UpVotes,DownVotes,Reputation,Views,badges,Q_comments,A_comments,P_questions,P_answers,p_ModeratorNomination,p_TagWiki,p_TagWikiExerpt,ph_InitialTitle,ph_EditTitle,ph_InitialBody,ph_InitialTags,ph_EditBody,ph_EditTags,ph_RollbackTitle,ph_RollbackBody,ph_PostReopened,ph_RollbackTags,ph_PostClosed,ph_PostDeleted,ph_PostUndeleted,ph_CommunityOwned,ph_PostLocked,ph_PostUnlocked,ph_QuestionUnprotected,ph_QuestionProtected,ph_PostNoticeRemoved,ph_PostNoticeAdded,ph_PostMergeSource,ph_PostMigratedAway,ph_PostMergeDestination
count,112147.0,72567.0,112147.0,112147.0,112147.0,112147.0,112147.0,112147.0,29880.0,29880.0,83111.0,82712.0,18.0,639.0,782.0,30676.0,7457.0,37948.0,30672.0,22936.0,7423.0,269.0,943.0,84.0,329.0,124.0,2914.0,2936.0,956.0,6.0,6.0,19.0,72.0,1109.0,1844.0,5.0,5.0,5.0
mean,5254029.0,103.525253,33.385235,53.412922,6.04602,544.607034,65.52754,8.762972,7.093273,14.362483,5.081337,15.440577,1.444444,3.336463,3.372123,9.698559,11.36972,30.030041,9.695716,31.791725,15.943823,3.962825,5.135737,20.02381,2.714286,167.58871,3.586479,3.626022,3.789749,115.166667,13.833333,4.526316,15.861111,2.03697,2.399132,81.2,66.0,246.8
std,3493744.0,240.822091,35.728919,366.203056,193.574696,6673.717768,1258.866708,34.372489,21.792996,120.308331,20.267219,255.790089,0.855585,8.367411,10.869657,28.613973,104.543527,176.049231,28.608055,254.843311,117.261008,11.919448,24.004614,57.844198,6.676118,695.274865,15.982254,17.237171,34.506737,179.208724,15.791348,9.057645,77.046594,3.491284,4.871225,114.707018,131.556072,340.281795
min,32.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,10.0,1.0,28.0
25%,2039419.0,0.0,2.4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,25.0,3.75,1.0,1.0,1.0,1.0,17.0,1.0,61.0
50%,4943085.0,22.0,19.4,0.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,4.0,3.0,4.0,2.0,1.0,1.0,4.0,1.0,13.0,1.0,1.0,1.0,43.5,8.5,1.0,2.0,1.0,1.0,43.0,9.0,147.0
75%,8321042.0,99.0,58.5,4.0,0.0,57.0,13.0,6.0,6.0,4.0,3.0,4.0,1.75,2.0,2.0,8.0,4.0,16.0,8.0,12.0,5.0,3.0,3.0,17.25,2.0,63.0,2.0,2.0,2.0,92.0,15.5,2.0,8.25,2.0,2.0,52.0,18.0,150.0
max,12004560.0,5687.0,134.9,32909.0,45436.0,851064.0,268394.0,4097.0,618.0,9548.0,1107.0,59042.0,4.0,107.0,227.0,1082.0,6279.0,15888.0,1080.0,17399.0,5527.0,135.0,510.0,465.0,79.0,5767.0,442.0,518.0,1019.0,474.0,44.0,37.0,654.0,56.0,143.0,284.0,301.0,848.0


In [9]:
# Print the column labels of the training frame.
# NOTE(review): this repeats the listing printed by cell In [6], and no visible
# cell in between modifies df_train — looks redundant; confirm before removing.
print(df_train.columns.values)

['Id' 'country' 'AboutMe_length' 'activity_in_months' 'UpVotes'
 'DownVotes' 'Reputation' 'Views' 'badges' 'Q_comments' 'A_comments'
 'P_questions' 'P_answers' 'p_ModeratorNomination' 'p_TagWiki'
 'p_TagWikiExerpt' 'ph_InitialTitle' 'ph_EditTitle' 'ph_InitialBody'
 'ph_InitialTags' 'ph_EditBody' 'ph_EditTags' 'ph_RollbackTitle'
 'ph_RollbackBody' 'ph_PostReopened' 'ph_RollbackTags' 'ph_PostClosed'
 'ph_PostDeleted' 'ph_PostUndeleted' 'ph_CommunityOwned' 'ph_PostLocked'
 'ph_PostUnlocked' 'ph_QuestionUnprotected' 'ph_QuestionProtected'
 'ph_PostNoticeRemoved' 'ph_PostNoticeAdded' 'ph_PostMergeSource'
 'ph_PostMigratedAway' 'ph_PostMergeDestination']


In [15]:
# Summarise only the object-dtype columns of the training frame
# (count, number of unique values, most frequent value and its frequency).
df_train.describe(include=[object])

Unnamed: 0,country
count,112147
unique,3
top,usa
freq,69636


In [17]:
# Mean activity_in_months per country in the training frame, highest first.
(
    df_train[['country', 'activity_in_months']]
    .groupby(['country'], as_index=False)
    .mean()
    .sort_values(by='activity_in_months', ascending=False)
)

Unnamed: 0,country,activity_in_months
2,usa,36.270106
1,russia,36.041291
0,china,24.821235
