# Preliminary data analysis
- Analyze the data
- E.g., distribution of posts per subreddits (as source and as target)

## Data analysis of the training files

In [1]:
import pandas as pd

#### Read the training files

In [2]:
df=pd.read_csv('soc-redditHyperlinks-body.tsv',sep='\t')

#### Take a look at our data

In [3]:
df.head()
df.to_csv('my_dataframe.csv', columns=["SOURCE_SUBREDDIT", "TARGET_SUBREDDIT", "POST_ID", "TIMESTAMP", "LINK_SENTIMENT"])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281562 entries, 0 to 281561
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   SOURCE_SUBREDDIT  281562 non-null  object
 1   TARGET_SUBREDDIT  281562 non-null  object
 2   POST_ID           281562 non-null  object
 3   TIMESTAMP         281562 non-null  object
 4   LINK_SENTIMENT    281562 non-null  int64 
 5   PROPERTIES        281562 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.9+ MB


In [5]:
print(f'earliest entry: {df["TIMESTAMP"].min()}')
print(f'latest entry: {df["TIMESTAMP"].max()}')

earliest entry: 2014-01-01 10:08:48
latest entry: 2017-04-30 16:58:21


In [6]:
print(df.shape)
print(df["LINK_SENTIMENT"].max())
print(df["LINK_SENTIMENT"].min())
print(df["LINK_SENTIMENT"].mean())
print(df["LINK_SENTIMENT"].std())
print(f'\namount of positive and negatie values in column: {df["LINK_SENTIMENT"].value_counts()}')

(281562, 6)
1
-1
0.8530483516951861
0.5218328054159836

amount of positive and negatie values in column: LINK_SENTIMENT
 1    260874
-1     20688
Name: count, dtype: int64


#### Data cleaning necessary?

In [7]:
#check whether 
df.isna().any()

SOURCE_SUBREDDIT    False
TARGET_SUBREDDIT    False
POST_ID             False
TIMESTAMP           False
LINK_SENTIMENT      False
PROPERTIES          False
dtype: bool

In [8]:
#check for duplicates in all rows
df.duplicated().any()

False

#### Check for duplicates in columns

In [9]:
print(df["SOURCE_SUBREDDIT"].duplicated().value_counts())
print()
print(df["TARGET_SUBREDDIT"].duplicated().value_counts())
print()
print(df["POST_ID"].duplicated().value_counts())
print()
print(df["TIMESTAMP"].duplicated().value_counts())
print()
print(df["PROPERTIES"].duplicated().value_counts())

SOURCE_SUBREDDIT
True     253956
False     27606
Name: count, dtype: int64

TARGET_SUBREDDIT
True     261115
False     20447
Name: count, dtype: int64

POST_ID
False    254511
True      27051
Name: count, dtype: int64

TIMESTAMP
False    244073
True      37489
Name: count, dtype: int64

PROPERTIES
False    244180
True      37382
Name: count, dtype: int64


#### Statistics for the positvie and negative values

In [10]:
df.groupby("LINK_SENTIMENT").describe()

Unnamed: 0_level_0,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,POST_ID,POST_ID,POST_ID,TIMESTAMP,TIMESTAMP,TIMESTAMP,TIMESTAMP,PROPERTIES,PROPERTIES,PROPERTIES,PROPERTIES
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
LINK_SENTIMENT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
-1,20688,4119,subredditdrama,1418,20688,3885,askreddit,867,20688,18547,3yj2ee,53,20688,18099,2015-12-27 20:14:14,53,20688,18244,"17234.0,16374.0,0.732041313682,0.0276778461181...",53
1,260874,27027,subredditdrama,3183,260874,19959,askreddit,6254,260874,235964,4asjoos,167,260874,227930,2014-08-20 14:40:53,360,260874,226640,"39838.0,38917.0,0.656057030975,0.064335559014,...",167


#### Some more statistics according to the "properties" column
0. Number of characters
1. Number of characters without counting white space
2. Fraction of alphabetical characters
3. Fraction of digits
4. Fraction of uppercase characters
5. Fraction of white spaces
6. Fraction of special characters, such as comma, exclamation mark, etc.
7. Number of words
8. Number of unique works
9. Number of long words (at least 6 characters)
10. Average word length
11. Number of unique stopwords
12. Fraction of stopwords
13. Number of sentences
14. Number of long sentences (at least 10 words)
15. Average number of characters per sentence
16. Average number of words per sentence
17. Automated readability index
18. Positive sentiment calculated by VADER
29. Negative sentiment calculated by VADER
20. Compound sentiment calculated by VADER

source: https://snap.stanford.edu/data/soc-RedditHyperlinks.html

In [11]:
type_index = int(input("type index number..."))
print(f"Maximum value: {df['PROPERTIES'].str.split(',').str[type_index].max()}")
print(f"Minimum value: {df['PROPERTIES'].str.split(',').str[type_index].min()}")

type index number...0
Maximum value: 9999.0
Minimum value: 100.0


### Data analysis of the test files

#### Read the test files

In [12]:
df2=pd.read_csv('soc-redditHyperlinks-body-test.tsv',sep='\t')

#### Take a look at our data

In [13]:
df2.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,gaming4gamers,fallout,1u5q84s,2014-01-01 06:55:04,1,"102.0,102.0,0.803921568627,0.0294117647059,0.0..."
1,kpop,dota2,1u5qg2s,2014-01-01 07:05:10,1,"1050.0,938.0,0.705714285714,0.0419047619048,0...."
2,airsoft,airsoftmarket,1u5r7js,2014-01-01 07:09:16,1,"3788.0,3206.0,0.737856388596,0.0145195353749,0..."
3,circlebroke,childfree,1u5rs9s,2014-01-01 06:51:30,1,"1007.0,871.0,0.781529294935,0.00794438927507,0..."
4,tribes,games,1u5syks,2014-01-01 09:06:30,1,"1316.0,1108.0,0.788753799392,0.00303951367781,..."


In [14]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   SOURCE_SUBREDDIT  4999 non-null   object
 1   TARGET_SUBREDDIT  4999 non-null   object
 2   POST_ID           4999 non-null   object
 3   TIMESTAMP         4999 non-null   object
 4   LINK_SENTIMENT    4999 non-null   int64 
 5   PROPERTIES        4999 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [73]:
print(f'shape:{df2.shape}')
print(f'max value: {df2["LINK_SENTIMENT"].max()}')
print(f'min value: {df2["LINK_SENTIMENT"].min()}')
print(f'mean value: {df2["LINK_SENTIMENT"].mean()}')
print(f'standard deviation value: {df2["LINK_SENTIMENT"].std()}')
print(f'\namount of positive and negatie values in column: {df2["LINK_SENTIMENT"].value_counts()}')

print()

print(f'amount of positive values (test data): {df2["LINK_SENTIMENT"].value_counts()[1]/df2.shape[0]}')
print(f'amount of negative values (test data): {df2["LINK_SENTIMENT"].value_counts()[-1]/df2.shape[0]}')
print(f'amount of positive values (training data): {df["LINK_SENTIMENT"].value_counts()[1]/df.shape[0]}')
print(f'amount of negative values (training data): {df["LINK_SENTIMENT"].value_counts()[-1]/df.shape[0]}')
print(f'differences test data/training data (positive values): {abs(df2["LINK_SENTIMENT"].value_counts()[1]/df2.shape[0]-df["LINK_SENTIMENT"].value_counts()[1]/df.shape[0])}')
print(f'differences test data/training data (negative values): {abs(df2["LINK_SENTIMENT"].value_counts()[-1]/df2.shape[0]-df["LINK_SENTIMENT"].value_counts()[-1]/df.shape[0])}')


print()

print('deviations/relations test data/training data (mean, standard deviation):')
print(f'mean relation: {df2["LINK_SENTIMENT"].mean()/df["LINK_SENTIMENT"].mean()}')
print(f'mean deviation: {abs(1-df2["LINK_SENTIMENT"].mean()/df["LINK_SENTIMENT"].mean())}')
print(f'std relation: {df["LINK_SENTIMENT"].std()/df2["LINK_SENTIMENT"].std()}')
print(f'std deviation: {abs(1-df2["LINK_SENTIMENT"].std()/df["LINK_SENTIMENT"].std())}')

shape:(4999, 6)
max value: 1
min value: -1
mean value: 0.8471694338867773
standard deviation value: 0.5313759814588873

amount of positive and negatie values in column: LINK_SENTIMENT
 1    4617
-1     382
Name: count, dtype: int64

amount of positive values (test data): 0.9235847169433887
amount of negative values (test data): 0.07641528305661133
amount of positive values (training data): 0.9265241758475931
amount of negative values (training data): 0.07347582415240693
differences test data/training data (positive values): 0.0029394589042043284
differences test data/training data (negative values): 0.002939458904204398

deviations/relations test data/training data (mean, standard deviation):
mean relation: 0.9931083416353527
mean deviation: 0.006891658364647335
std relation: 0.9820406334198566
std deviation: 0.018287803955322923


#### Data cleaning necessary?

In [16]:
#check whether 
df2.isna().any()

SOURCE_SUBREDDIT    False
TARGET_SUBREDDIT    False
POST_ID             False
TIMESTAMP           False
LINK_SENTIMENT      False
PROPERTIES          False
dtype: bool

In [17]:
#check for duplicates in all rows
df2.duplicated().any()

False

#### Check for duplicates in columns

In [18]:
print(df2["SOURCE_SUBREDDIT"].duplicated().value_counts())
print()
print(df2["TARGET_SUBREDDIT"].duplicated().value_counts())
print()
print(df2["POST_ID"].duplicated().value_counts())
print()
print(df2["TIMESTAMP"].duplicated().value_counts())
print()
print(df2["PROPERTIES"].duplicated().value_counts())

SOURCE_SUBREDDIT
True     3168
False    1831
Name: count, dtype: int64

TARGET_SUBREDDIT
True     3487
False    1512
Name: count, dtype: int64

POST_ID
False    4581
True      418
Name: count, dtype: int64

TIMESTAMP
False    4392
True      607
Name: count, dtype: int64

PROPERTIES
False    4490
True      509
Name: count, dtype: int64


#### Statistics for the positvie and negative values

In [86]:
df2.groupby("LINK_SENTIMENT").describe()

Unnamed: 0_level_0,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,POST_ID,POST_ID,POST_ID,TIMESTAMP,TIMESTAMP,TIMESTAMP,TIMESTAMP,PROPERTIES,PROPERTIES,PROPERTIES,PROPERTIES
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
LINK_SENTIMENT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
-1,382,184,circlebroke,31,382,207,askreddit,25,382,349,1v3273,9,382,344,2014-01-12 20:30:55,9,382,347,"9956.0,8601.0,0.787364403375,0.00693049417437,...",13
1,4617,1786,dailydot,140,4617,1458,askreddit,183,4617,4232,1wt4ots,23,4617,4090,2014-01-17 14:40:58,124,4617,4151,"3992.0,3774.0,0.740731462926,0.0225450901804,0...",23


#### Some more statistics according to the "properties" column
0. Number of characters
1. Number of characters without counting white space
2. Fraction of alphabetical characters
3. Fraction of digits
4. Fraction of uppercase characters
5. Fraction of white spaces
6. Fraction of special characters, such as comma, exclamation mark, etc.
7. Number of words
8. Number of unique works
9. Number of long words (at least 6 characters)
10. Average word length
11. Number of unique stopwords
12. Fraction of stopwords
13. Number of sentences
14. Number of long sentences (at least 10 words)
15. Average number of characters per sentence
16. Average number of words per sentence
17. Automated readability index
18. Positive sentiment calculated by VADER
29. Negative sentiment calculated by VADER
20. Compound sentiment calculated by VADER

source: https://snap.stanford.edu/data/soc-RedditHyperlinks.html

In [84]:
type_index = int(input("type index number..."))
print(f"Maximum value: {df2['PROPERTIES'].str.split(',').str[type_index].max()}")
print(f"Minimum value: {df2['PROPERTIES'].str.split(',').str[type_index].min()}")

type index number...10
Maximum value: 9.9
Minimum value: 10.0
